Chris Down (6):
mm, memcg: Prevent memory.high load/store tearing
mm, memcg: Prevent memory.max load tearing
mm, memcg: Prevent memory.low load/store tearing
mm, memcg: Prevent memory.min load/store tearing
mm, memcg: Prevent memory.swap.max load tearing
mm, memcg: Prevent mem_cgroup_protected store tearing
mm/memcontrol.c | 42 ++++++++++++++++++++++--------------------
mm/page_counter.c | 17 +++++++++++------
2 files changed, 33 insertions(+), 26 deletions(-)
--
2.25.1
A mem_cgroup's high attribute can be set concurrently with reads -- for
example, if we are in memory_high_write at the same time as we are
trying to do high reclaim.
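As background for reviewers less familiar with the idiom, here is a minimal
userspace sketch of what the READ_ONCE/WRITE_ONCE pairing buys us. The macro
definitions below only approximate the kernel's volatile-cast versions, and
the variable and function names are illustrative, not the real memcg code:

	#include <stdio.h>

	/*
	 * Rough stand-ins for the kernel macros: the volatile cast forces a
	 * single, whole-width access that the compiler may not split, merge
	 * or reload.
	 */
	#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

	static unsigned long high_limit;	/* stands in for memcg->high */
	static unsigned long usage;		/* stands in for the page counter */

	/* Writer side, in the spirit of memory_high_write(). */
	static void set_high(unsigned long new_high)
	{
		WRITE_ONCE(high_limit, new_high);
	}

	/*
	 * Reader side, in the spirit of reclaim_high(): snapshot the values
	 * once; with plain accesses the compiler could reload or split the
	 * loads while a writer runs concurrently.
	 */
	static int over_high(void)
	{
		return READ_ONCE(usage) > READ_ONCE(high_limit);
	}

	int main(void)
	{
		set_high(1024);
		usage = 2048;
		printf("over high: %d\n", over_high());
		return 0;
	}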
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 63bb6a2aab81..d32d3c0a16d4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2228,7 +2228,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
+ if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2545,7 +2545,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > memcg->high) {
+ if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
schedule_work(&memcg->high_work);
@@ -4257,7 +4257,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max,
+ READ_ONCE(memcg->high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4978,7 +4979,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (!memcg)
return ERR_PTR(error);
- memcg->high = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -5131,7 +5132,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- memcg->high = PAGE_COUNTER_MAX;
+ WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
}
@@ -5947,7 +5948,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- memcg->high = high;
+ WRITE_ONCE(memcg->high, high);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
--
2.25.1
This can be set concurrently with reads, which may cause the wrong value
to be propagated.
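The page_counter side of the change also pulls the value into a local before
using it. A rough sketch of why that matters, with a stub type and a
hypothetical helper name (not the exact kernel function), assuming the
kernel's READ_ONCE:

	struct counter { unsigned long low; };

	/*
	 * With two plain reads of c->low, a racing page_counter_set_low()
	 * could make the "is protection configured" test and the computed
	 * protected amount disagree. One READ_ONCE into a local keeps the
	 * function internally consistent, even though the value it sees may
	 * be either the old or the new setting.
	 */
	static unsigned long protected_amount(struct counter *c,
					      unsigned long usage)
	{
		unsigned long low = READ_ONCE(c->low);

		if (!low)
			return 0;

		return usage <= low ? usage : 0;
	}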
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 4 ++--
mm/page_counter.c | 9 ++++++---
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index aca2964ea494..c85a304fa4a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6262,7 +6262,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
return MEMCG_PROT_NONE;
emin = memcg->memory.min;
- elow = memcg->memory.low;
+ elow = READ_ONCE(memcg->memory.low);
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
@@ -6291,7 +6291,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (elow && parent_elow) {
unsigned long low_usage, siblings_low_usage;
- low_usage = min(usage, memcg->memory.low);
+ low_usage = min(usage, READ_ONCE(memcg->memory.low));
siblings_low_usage = atomic_long_read(
&parent->memory.children_low_usage);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 50184929b61f..18b7f779f2e2 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,6 +17,7 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
+ unsigned long low;
long delta;
if (!c->parent)
@@ -34,8 +35,10 @@ static void propagate_protected_usage(struct page_counter *c,
atomic_long_add(delta, &c->parent->children_min_usage);
}
- if (c->low || atomic_long_read(&c->low_usage)) {
- if (usage <= c->low)
+ low = READ_ONCE(c->low);
+
+ if (low || atomic_long_read(&c->low_usage)) {
+ if (usage <= low)
protected = usage;
else
protected = 0;
@@ -231,7 +234,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;
- counter->low = nr_pages;
+ WRITE_ONCE(counter->low, nr_pages);
for (c = counter; c; c = c->parent)
propagate_protected_usage(c, atomic_long_read(&c->usage));
--
2.25.1
This can be set concurrently with reads, which may cause the wrong value
to be propagated.
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 4 ++--
mm/page_counter.c | 10 ++++++----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c85a304fa4a1..e0ed790a2a8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6261,7 +6261,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (!usage)
return MEMCG_PROT_NONE;
- emin = memcg->memory.min;
+ emin = READ_ONCE(memcg->memory.min);
elow = READ_ONCE(memcg->memory.low);
parent = parent_mem_cgroup(memcg);
@@ -6277,7 +6277,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
if (emin && parent_emin) {
unsigned long min_usage, siblings_min_usage;
- min_usage = min(usage, memcg->memory.min);
+ min_usage = min(usage, READ_ONCE(memcg->memory.min));
siblings_min_usage = atomic_long_read(
&parent->memory.children_min_usage);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 18b7f779f2e2..ae471c7d255f 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,14 +17,16 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
- unsigned long low;
+ unsigned long low, min;
long delta;
if (!c->parent)
return;
- if (c->min || atomic_long_read(&c->min_usage)) {
- if (usage <= c->min)
+ min = READ_ONCE(c->min);
+
+ if (min || atomic_long_read(&c->min_usage)) {
+ if (usage <= min)
protected = usage;
else
protected = 0;
@@ -217,7 +219,7 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;
- counter->min = nr_pages;
+ WRITE_ONCE(counter->min, nr_pages);
for (c = counter; c; c = c->parent)
propagate_protected_usage(c, atomic_long_read(&c->usage));
--
2.25.1
The read side of this is all protected, but we can still tear if
multiple iterations of mem_cgroup_protected are going at the same time.
There's some intentional racing in mem_cgroup_protected which is ok, but
load/store tearing should be avoided.
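As a small sketch of the distinction being drawn here (stub type and
hypothetical helper names, assuming the kernel's READ_ONCE/WRITE_ONCE): a
reader racing with the writer may observe either the previous effective
values or the new ones, which is the acceptable race, but each field must
still be stored and loaded whole:

	struct protection { unsigned long emin, elow; };

	/*
	 * Writer, in the spirit of mem_cgroup_protected(): publish the
	 * freshly computed effective protections.
	 */
	static void publish(struct protection *p, unsigned long emin,
			    unsigned long elow)
	{
		WRITE_ONCE(p->emin, emin);
		WRITE_ONCE(p->elow, elow);
	}

	/*
	 * Reader, e.g. the reclaim side: any whole value -- old or new --
	 * is usable, but a torn mixture of the two is not.
	 */
	static unsigned long effective_low(struct protection *p)
	{
		return READ_ONCE(p->elow);
	}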
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 57048a38c75d..e9af606238ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6301,8 +6301,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
}
exit:
- memcg->memory.emin = emin;
- memcg->memory.elow = elow;
+ WRITE_ONCE(memcg->memory.emin, emin);
+ WRITE_ONCE(memcg->memory.elow, elow);
if (usage <= emin)
return MEMCG_PROT_MIN;
--
2.25.1
This one is a bit more nuanced because we have memcg_max_mutex, which is
mostly just used for enforcing invariants, but we still need to
READ_ONCE since (despite its name) it doesn't really protect memory.max
access.
On write (page_counter_set_max() and memory_max_write()) we use xchg(),
which uses smp_mb(), so that's already fine.
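Roughly, the asymmetry is the following (a sketch assuming kernel context,
not a verbatim excerpt from the tree):

	/*
	 * Write side, as in page_counter_set_max(): xchg() is a single
	 * atomic read-modify-write with full ordering, so no WRITE_ONCE is
	 * needed on top of it.
	 */
	old = xchg(&counter->max, nr_pages);

	/*
	 * Read side: memcg_max_mutex only serialises writers against each
	 * other, so lockless readers such as mem_cgroup_get_max() still
	 * need READ_ONCE to be guaranteed a whole, untorn value.
	 */
	max = READ_ONCE(memcg->memory.max);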
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d32d3c0a16d4..aca2964ea494 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1507,7 +1507,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.max), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
@@ -1538,7 +1538,7 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max;
- max = memcg->memory.max;
+ max = READ_ONCE(memcg->memory.max);
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_max;
unsigned long swap_max;
@@ -3006,7 +3006,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
* Make sure that the new limit (memsw or memory limit) doesn't
* break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? max >= memcg->memory.max :
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
max <= memcg->memsw.max;
if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
@@ -3753,8 +3753,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.max);
- memsw = min(memsw, mi->memsw.max);
+ memory = min(memory, READ_ONCE(mi->memory.max));
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -4257,7 +4257,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.max,
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
READ_ONCE(memcg->high));
unsigned long used = page_counter_read(&memcg->memory);
--
2.25.1
The write side of this is xchg()/smp_mb(), so that's all good. Just a
few sites missing a READ_ONCE.
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0ed790a2a8c..57048a38c75d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1511,7 +1511,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)memcg->swap.max), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
@@ -1544,7 +1544,7 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
unsigned long swap_max;
memsw_max = memcg->memsw.max;
- swap_max = memcg->swap.max;
+ swap_max = READ_ONCE(memcg->swap.max);
swap_max = min(swap_max, (unsigned long)total_swap_pages);
max = min(max + swap_max, memsw_max);
}
@@ -7025,7 +7025,8 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+ if (page_counter_read(&memcg->swap) * 2 >=
+ READ_ONCE(memcg->swap.max))
return true;
return false;
--
2.25.1
On Thu 12-03-20 17:32:51, Chris Down wrote:
> A mem_cgroup's high attribute can be set concurrently with reads -- for
> example, if we are in memory_high_write at the same time as we are
> trying to do high reclaim.
I assume this is a replace-all kind of patch, because css_alloc shouldn't
really be subject to races. I am not sure about css_reset, but it sounds
safe as well.
That being said, I do not object because this cannot be harmful, but it
would be nice to mention that in the changelog just in case somebody
wonders about this in the future.
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 13 +++++++------
> 1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 63bb6a2aab81..d32d3c0a16d4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2228,7 +2228,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
> gfp_t gfp_mask)
> {
> do {
> - if (page_counter_read(&memcg->memory) <= memcg->high)
> + if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
> continue;
> memcg_memory_event(memcg, MEMCG_HIGH);
> try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
> @@ -2545,7 +2545,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> * reclaim, the cost of mismatch is negligible.
> */
> do {
> - if (page_counter_read(&memcg->memory) > memcg->high) {
> + if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
> /* Don't bother a random interrupted task */
> if (in_interrupt()) {
> schedule_work(&memcg->high_work);
> @@ -4257,7 +4257,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> *pheadroom = PAGE_COUNTER_MAX;
>
> while ((parent = parent_mem_cgroup(memcg))) {
> - unsigned long ceiling = min(memcg->memory.max, memcg->high);
> + unsigned long ceiling = min(memcg->memory.max,
> + READ_ONCE(memcg->high));
> unsigned long used = page_counter_read(&memcg->memory);
>
> *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
> @@ -4978,7 +4979,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> if (!memcg)
> return ERR_PTR(error);
>
> - memcg->high = PAGE_COUNTER_MAX;
> + WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
> memcg->soft_limit = PAGE_COUNTER_MAX;
> if (parent) {
> memcg->swappiness = mem_cgroup_swappiness(parent);
> @@ -5131,7 +5132,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
> page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
> page_counter_set_min(&memcg->memory, 0);
> page_counter_set_low(&memcg->memory, 0);
> - memcg->high = PAGE_COUNTER_MAX;
> + WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
> memcg->soft_limit = PAGE_COUNTER_MAX;
> memcg_wb_domain_size_changed(memcg);
> }
> @@ -5947,7 +5948,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> if (err)
> return err;
>
> - memcg->high = high;
> + WRITE_ONCE(memcg->high, high);
>
> for (;;) {
> unsigned long nr_pages = page_counter_read(&memcg->memory);
> --
> 2.25.1
>
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 17:32:56, Chris Down wrote:
> This one is a bit more nuanced because we have memcg_max_mutex, which is
> mostly just used for enforcing invariants, but we still need to
> READ_ONCE since (despite its name) it doesn't really protect memory.max
> access.
>
> On write (page_counter_set_max() and memory_max_write()) we use xchg(),
> which uses smp_mb(), so that's already fine.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d32d3c0a16d4..aca2964ea494 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1507,7 +1507,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
>
> pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
> K((u64)page_counter_read(&memcg->memory)),
> - K((u64)memcg->memory.max), memcg->memory.failcnt);
> + K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
> K((u64)page_counter_read(&memcg->swap)),
> @@ -1538,7 +1538,7 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
> {
> unsigned long max;
>
> - max = memcg->memory.max;
> + max = READ_ONCE(memcg->memory.max);
> if (mem_cgroup_swappiness(memcg)) {
> unsigned long memsw_max;
> unsigned long swap_max;
> @@ -3006,7 +3006,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> * Make sure that the new limit (memsw or memory limit) doesn't
> * break our basic invariant rule memory.max <= memsw.max.
> */
> - limits_invariant = memsw ? max >= memcg->memory.max :
> + limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
> max <= memcg->memsw.max;
> if (!limits_invariant) {
> mutex_unlock(&memcg_max_mutex);
> @@ -3753,8 +3753,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
> /* Hierarchical information */
> memory = memsw = PAGE_COUNTER_MAX;
> for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
> - memory = min(memory, mi->memory.max);
> - memsw = min(memsw, mi->memsw.max);
> + memory = min(memory, READ_ONCE(mi->memory.max));
> + memsw = min(memsw, READ_ONCE(mi->memsw.max));
> }
> seq_printf(m, "hierarchical_memory_limit %llu\n",
> (u64)memory * PAGE_SIZE);
> @@ -4257,7 +4257,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> *pheadroom = PAGE_COUNTER_MAX;
>
> while ((parent = parent_mem_cgroup(memcg))) {
> - unsigned long ceiling = min(memcg->memory.max,
> + unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
> READ_ONCE(memcg->high));
> unsigned long used = page_counter_read(&memcg->memory);
>
> --
> 2.25.1
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 17:33:01, Chris Down wrote:
> This can be set concurrently with reads, which may cause the wrong value
> to be propagated.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 4 ++--
> mm/page_counter.c | 9 ++++++---
> 2 files changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index aca2964ea494..c85a304fa4a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6262,7 +6262,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> return MEMCG_PROT_NONE;
>
> emin = memcg->memory.min;
> - elow = memcg->memory.low;
> + elow = READ_ONCE(memcg->memory.low);
>
> parent = parent_mem_cgroup(memcg);
> /* No parent means a non-hierarchical mode on v1 memcg */
> @@ -6291,7 +6291,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> if (elow && parent_elow) {
> unsigned long low_usage, siblings_low_usage;
>
> - low_usage = min(usage, memcg->memory.low);
> + low_usage = min(usage, READ_ONCE(memcg->memory.low));
> siblings_low_usage = atomic_long_read(
> &parent->memory.children_low_usage);
>
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index 50184929b61f..18b7f779f2e2 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -17,6 +17,7 @@ static void propagate_protected_usage(struct page_counter *c,
> unsigned long usage)
> {
> unsigned long protected, old_protected;
> + unsigned long low;
> long delta;
>
> if (!c->parent)
> @@ -34,8 +35,10 @@ static void propagate_protected_usage(struct page_counter *c,
> atomic_long_add(delta, &c->parent->children_min_usage);
> }
>
> - if (c->low || atomic_long_read(&c->low_usage)) {
> - if (usage <= c->low)
> + low = READ_ONCE(c->low);
> +
> + if (low || atomic_long_read(&c->low_usage)) {
> + if (usage <= low)
> protected = usage;
> else
> protected = 0;
> @@ -231,7 +234,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
> {
> struct page_counter *c;
>
> - counter->low = nr_pages;
> + WRITE_ONCE(counter->low, nr_pages);
>
> for (c = counter; c; c = c->parent)
> propagate_protected_usage(c, atomic_long_read(&c->usage));
> --
> 2.25.1
>
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 17:33:11, Chris Down wrote:
> The write side of this is xchg()/smp_mb(), so that's all good. Just a
> few sites missing a READ_ONCE.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e0ed790a2a8c..57048a38c75d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1511,7 +1511,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
> if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
> K((u64)page_counter_read(&memcg->swap)),
> - K((u64)memcg->swap.max), memcg->swap.failcnt);
> + K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
> else {
> pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
> K((u64)page_counter_read(&memcg->memsw)),
> @@ -1544,7 +1544,7 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
> unsigned long swap_max;
>
> memsw_max = memcg->memsw.max;
> - swap_max = memcg->swap.max;
> + swap_max = READ_ONCE(memcg->swap.max);
> swap_max = min(swap_max, (unsigned long)total_swap_pages);
> max = min(max + swap_max, memsw_max);
> }
> @@ -7025,7 +7025,8 @@ bool mem_cgroup_swap_full(struct page *page)
> return false;
>
> for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
> - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
> + if (page_counter_read(&memcg->swap) * 2 >=
> + READ_ONCE(memcg->swap.max))
> return true;
>
> return false;
> --
> 2.25.1
>
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 17:33:07, Chris Down wrote:
> This can be set concurrently with reads, which may cause the wrong value
> to be propagated.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 4 ++--
> mm/page_counter.c | 10 ++++++----
> 2 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c85a304fa4a1..e0ed790a2a8c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6261,7 +6261,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> if (!usage)
> return MEMCG_PROT_NONE;
>
> - emin = memcg->memory.min;
> + emin = READ_ONCE(memcg->memory.min);
> elow = READ_ONCE(memcg->memory.low);
>
> parent = parent_mem_cgroup(memcg);
> @@ -6277,7 +6277,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> if (emin && parent_emin) {
> unsigned long min_usage, siblings_min_usage;
>
> - min_usage = min(usage, memcg->memory.min);
> + min_usage = min(usage, READ_ONCE(memcg->memory.min));
> siblings_min_usage = atomic_long_read(
> &parent->memory.children_min_usage);
>
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index 18b7f779f2e2..ae471c7d255f 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -17,14 +17,16 @@ static void propagate_protected_usage(struct page_counter *c,
> unsigned long usage)
> {
> unsigned long protected, old_protected;
> - unsigned long low;
> + unsigned long low, min;
> long delta;
>
> if (!c->parent)
> return;
>
> - if (c->min || atomic_long_read(&c->min_usage)) {
> - if (usage <= c->min)
> + min = READ_ONCE(c->min);
> +
> + if (min || atomic_long_read(&c->min_usage)) {
> + if (usage <= min)
> protected = usage;
> else
> protected = 0;
> @@ -217,7 +219,7 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
> {
> struct page_counter *c;
>
> - counter->min = nr_pages;
> + WRITE_ONCE(counter->min, nr_pages);
>
> for (c = counter; c; c = c->parent)
> propagate_protected_usage(c, atomic_long_read(&c->usage));
> --
> 2.25.1
>
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 17:33:16, Chris Down wrote:
> The read side of this is all protected, but we can still tear if
> multiple iterations of mem_cgroup_protected are going at the same time.
>
> There's some intentional racing in mem_cgroup_protected which is ok, but
> load/store tearing should be avoided.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 57048a38c75d..e9af606238ab 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6301,8 +6301,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> }
>
> exit:
> - memcg->memory.emin = emin;
> - memcg->memory.elow = elow;
> + WRITE_ONCE(memcg->memory.emin, emin);
> + WRITE_ONCE(memcg->memory.elow, elow);
>
> if (usage <= emin)
> return MEMCG_PROT_MIN;
> --
> 2.25.1
--
Michal Hocko
SUSE Labs
Hello.
I see a suspicious asymmetry in the current mainline:
> WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
> READ_ONCE(memcg->memory.min),
> READ_ONCE(parent->memory.emin),
> atomic_long_read(&parent->memory.children_min_usage)));
>
> WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
> memcg->memory.low, READ_ONCE(parent->memory.elow),
> atomic_long_read(&parent->memory.children_low_usage)));
On Thu, Mar 12, 2020 at 05:33:01PM +0000, Chris Down <[email protected]> wrote:
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index aca2964ea494..c85a304fa4a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -6262,7 +6262,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> return MEMCG_PROT_NONE;
>
> emin = memcg->memory.min;
> - elow = memcg->memory.low;
> + elow = READ_ONCE(memcg->memory.low);
>
> parent = parent_mem_cgroup(memcg);
> /* No parent means a non-hierarchical mode on v1 memcg */
> @@ -6291,7 +6291,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
> if (elow && parent_elow) {
> unsigned long low_usage, siblings_low_usage;
>
> - low_usage = min(usage, memcg->memory.low);
> + low_usage = min(usage, READ_ONCE(memcg->memory.low));
> siblings_low_usage = atomic_long_read(
> &parent->memory.children_low_usage);
Is it possible that these hunks were lost during rebase/merge?
IMHO it should apply as:
-- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6428,7 +6428,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_min_usage)));
WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
- memcg->memory.low, READ_ONCE(parent->memory.elow),
+ READ_ONCE(memcg->memory.low),
+ READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
out:
Michal
Hi Michal,
Good catch! Andrew and I must have missed these when massaging the commits with
other stuff in -mm, which is understandable considering the number of places
being touched by this and other patch series at the same time. It just goes to
show how complex this can get, since I even double-checked these and still
didn't see the missed hunk :-)
The good news is that these are belt-and-suspenders: they avoid a rare
theoretical case rather than anything likely to be practically dangerous. But
yes, we should fix it up. Since the commit is already out, I'll just submit a
new one.
Thanks for the report!
Chris