We used to have per-cpu memcg and lruvec stats, and readers had to
traverse and sum the stats from each cpu. This summing was racy and could
expose transient negative values, so an explicit check was added to
avoid such scenarios. Now that these stats have moved to the rstat
infrastructure and are no longer per-cpu, we can remove the fixup for
transient negative values.
Signed-off-by: Shakeel Butt <[email protected]>
---
include/linux/memcontrol.h | 15 ++-------------
1 file changed, 2 insertions(+), 13 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7028d8e4a3d7..5f2a39a43d47 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -991,30 +991,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
+ return READ_ONCE(memcg->vmstats.state[idx]);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
- long x;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
+ return READ_ONCE(pn->lruvec_stats.state[idx]);
}
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
--
2.32.0.432.gabb21c7263-goog
On Tue, Jul 27, 2021 at 06:22:43PM -0700, Shakeel Butt wrote:
> We used to have per-cpu memcg and lruvec stats and the readers have to
> traverse and sum the stats from each cpu. This summing was racy and may
> expose transient negative values. So, an explicit check was added to
> avoid such scenarios. Now these stats are moved to rstat infrastructure
> and are no more per-cpu, so we can remove the fixup for transient
> negative values.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
Thanks!
On 28.07.21 03:22, Shakeel Butt wrote:
> We used to have per-cpu memcg and lruvec stats and the readers have to
> traverse and sum the stats from each cpu. This summing was racy and may
> expose transient negative values. So, an explicit check was added to
> avoid such scenarios. Now these stats are moved to rstat infrastructure
> and are no more per-cpu, so we can remove the fixup for transient
> negative values.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> include/linux/memcontrol.h | 15 ++-------------
> 1 file changed, 2 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 7028d8e4a3d7..5f2a39a43d47 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -991,30 +991,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
>
> static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> {
> - long x = READ_ONCE(memcg->vmstats.state[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> + return READ_ONCE(memcg->vmstats.state[idx]);
> }
>
> static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> enum node_stat_item idx)
> {
> struct mem_cgroup_per_node *pn;
> - long x;
>
> if (mem_cgroup_disabled())
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats.state[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> + return READ_ONCE(pn->lruvec_stats.state[idx]);
> }
>
> static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On Tue, 27 Jul 2021 18:22:43 -0700 Shakeel Butt <[email protected]> wrote:
> We used to have per-cpu memcg and lruvec stats and the readers have to
> traverse and sum the stats from each cpu. This summing was racy and may
> expose transient negative values. So, an explicit check was added to
> avoid such scenarios. Now these stats are moved to rstat infrastructure
> and are no more per-cpu, so we can remove the fixup for transient
> negative values.
We can't do anything about the same code in lruvec_page_state_local()?
On Wed, Jul 28, 2021 at 12:43 PM Andrew Morton
<[email protected]> wrote:
>
> On Tue, 27 Jul 2021 18:22:43 -0700 Shakeel Butt <[email protected]> wrote:
>
> > We used to have per-cpu memcg and lruvec stats and the readers have to
> > traverse and sum the stats from each cpu. This summing was racy and may
> > expose transient negative values. So, an explicit check was added to
> > avoid such scenarios. Now these stats are moved to rstat infrastructure
> > and are no more per-cpu, so we can remove the fixup for transient
> > negative values.
>
> We can't do anything about the same code in lruvec_page_state_local()?
lruvec_page_state_local() is used by cgroup v1's memory.numa_stat for
cgroup-local (non-hierarchical) stats, and those stats are still per-cpu.
To make them non-per-cpu, we would have to add 'long
state_local[NR_VM_NODE_STAT_ITEMS]' to 'struct lruvec_stats' and do the
aggregation in rstat flushing. So we would be trading the cpu traversal
cost for more memory usage. I am not sure if it is worth it.
On Tue 27-07-21 18:22:43, Shakeel Butt wrote:
> We used to have per-cpu memcg and lruvec stats and the readers have to
> traverse and sum the stats from each cpu. This summing was racy and may
> expose transient negative values. So, an explicit check was added to
> avoid such scenarios. Now these stats are moved to rstat infrastructure
> and are no more per-cpu, so we can remove the fixup for transient
> negative values.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> include/linux/memcontrol.h | 15 ++-------------
> 1 file changed, 2 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 7028d8e4a3d7..5f2a39a43d47 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -991,30 +991,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
>
> static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
> {
> - long x = READ_ONCE(memcg->vmstats.state[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> + return READ_ONCE(memcg->vmstats.state[idx]);
> }
>
> static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> enum node_stat_item idx)
> {
> struct mem_cgroup_per_node *pn;
> - long x;
>
> if (mem_cgroup_disabled())
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats.state[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> + return READ_ONCE(pn->lruvec_stats.state[idx]);
> }
>
> static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> --
> 2.32.0.432.gabb21c7263-goog
--
Michal Hocko
SUSE Labs