2023-02-20 09:17:17

by Qi Zheng

Subject: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

Like the global slab shrink, now that commit 1cd0bd06093c
("rcu: Remove CONFIG_SRCU") has made SRCU unconditionally
available, it's time to use SRCU to protect the readers that
previously had to hold shrinker_rwsem.
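
For context, the read/update pattern this relies on is the standard
SRCU one (a minimal sketch only; the shrinker_srcu instance itself is
assumed to be defined by an earlier patch in this series):

```
#include <linux/srcu.h>

DEFINE_SRCU(shrinker_srcu);	/* assumed: introduced earlier in the series */

/* Read side: sleepable, and never blocks pointer updates. */
static void reader(struct mem_cgroup *memcg, int nid)
{
	struct shrinker_info *info;
	int idx;

	idx = srcu_read_lock(&shrinker_srcu);
	info = srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
				&shrinker_srcu);
	/* ... use info->map / info->nr_deferred ... */
	srcu_read_unlock(&shrinker_srcu, idx);
}

/* Update side: publish the new pointer, free the old one after a grace period. */
static void updater(struct mem_cgroup_per_node *pn,
		    struct shrinker_info *new, struct shrinker_info *old)
{
	rcu_assign_pointer(pn->shrinker_info, new);
	call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}
```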

We can test with the following script:

```
DIR="/root/shrinker/memcg/mnt"

do_create()
{
mkdir /sys/fs/cgroup/memory/test
echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
for i in `seq 0 $1`;
do
mkdir /sys/fs/cgroup/memory/test/$i;
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
mkdir -p $DIR/$i;
done
}

do_mount()
{
for i in `seq $1 $2`;
do
mount -t tmpfs $i $DIR/$i;
done
}

do_touch()
{
for i in `seq $1 $2`;
do
echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
done
}

do_create 2000
do_mount 0 2000
do_touch 0 1000
```

Before applying:

46.60% [kernel] [k] down_read_trylock
18.70% [kernel] [k] up_read
15.44% [kernel] [k] shrink_slab
4.37% [kernel] [k] _find_next_bit
2.75% [kernel] [k] xa_load
2.07% [kernel] [k] idr_find
1.73% [kernel] [k] do_shrink_slab
1.42% [kernel] [k] shrink_lruvec
0.74% [kernel] [k] shrink_node
0.60% [kernel] [k] list_lru_count_one

After applying:

19.53% [kernel] [k] _find_next_bit
14.63% [kernel] [k] do_shrink_slab
14.58% [kernel] [k] shrink_slab
11.83% [kernel] [k] shrink_lruvec
9.33% [kernel] [k] __blk_flush_plug
6.67% [kernel] [k] mem_cgroup_iter
3.73% [kernel] [k] list_lru_count_one
2.43% [kernel] [k] shrink_node
1.96% [kernel] [k] super_cache_count
1.78% [kernel] [k] __rcu_read_unlock
1.38% [kernel] [k] __srcu_read_lock
1.30% [kernel] [k] xas_descend

We can see that the readers are no longer blocked.

Signed-off-by: Qi Zheng <[email protected]>
---
mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95a3d6ddc6c1..dc47396ecd0e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,7 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/srcu.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
+ return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu,
+ lockdep_is_held(&shrinker_rwsem));
+}
+
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+ int nid)
+{
+ return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+ &shrinker_srcu);
+}
+
+static void free_shrinker_info_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct shrinker_info, rcu));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
@@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);

rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
+ call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
}

return 0;
@@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
+ int srcu_idx;

- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
- rcu_read_unlock();
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}
}

@@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
return -ENOSYS;

down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

@@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;

- info = shrinker_info_protected(memcg, nid);
+ info = shrinker_info_srcu(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
- int i, nid;
+ int i, nid, srcu_idx;
long nr;
struct mem_cgroup *parent;
struct shrinker_info *child_info, *parent_info;
@@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
parent = root_mem_cgroup;

/* Prevent from concurrent shrinker_info expand */
- down_read(&shrinker_rwsem);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
for_each_node(nid) {
- child_info = shrinker_info_protected(memcg, nid);
- parent_info = shrinker_info_protected(parent, nid);
+ child_info = shrinker_info_srcu(memcg, nid);
+ parent_info = shrinker_info_srcu(parent, nid);
for (i = 0; i < shrinker_nr_max; i++) {
nr = atomic_long_read(&child_info->nr_deferred[i]);
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
}

static bool cgroup_reclaim(struct scan_control *sc)
@@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
+ int srcu_idx;
int i;

if (!mem_cgroup_online(memcg))
return 0;

- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
+ srcu_idx = srcu_read_lock(&shrinker_srcu);
+ info = shrinker_info_srcu(memcg, nid);
if (unlikely(!info))
goto unlock;

@@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
}
unlock:
- up_read(&shrinker_rwsem);
+ srcu_read_unlock(&shrinker_srcu, srcu_idx);
return freed;
}
#else /* CONFIG_MEMCG */
--
2.20.1



2023-02-21 21:37:35

by Kirill Tkhai

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

On 20.02.2023 12:16, Qi Zheng wrote:
> Like the global slab shrink, now that commit 1cd0bd06093c
> ("rcu: Remove CONFIG_SRCU") has made SRCU unconditionally
> available, it's time to use SRCU to protect the readers that
> previously had to hold shrinker_rwsem.
>
> We can test with the following script:
>
> ```
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir /sys/fs/cgroup/memory/test
> echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> do_create 2000
> do_mount 0 2000
> do_touch 0 1000
> ```
>
> Before applying:
>
> 46.60% [kernel] [k] down_read_trylock
> 18.70% [kernel] [k] up_read
> 15.44% [kernel] [k] shrink_slab
> 4.37% [kernel] [k] _find_next_bit
> 2.75% [kernel] [k] xa_load
> 2.07% [kernel] [k] idr_find
> 1.73% [kernel] [k] do_shrink_slab
> 1.42% [kernel] [k] shrink_lruvec
> 0.74% [kernel] [k] shrink_node
> 0.60% [kernel] [k] list_lru_count_one
>
> After applying:
>
> 19.53% [kernel] [k] _find_next_bit
> 14.63% [kernel] [k] do_shrink_slab
> 14.58% [kernel] [k] shrink_slab
> 11.83% [kernel] [k] shrink_lruvec
> 9.33% [kernel] [k] __blk_flush_plug
> 6.67% [kernel] [k] mem_cgroup_iter
> 3.73% [kernel] [k] list_lru_count_one
> 2.43% [kernel] [k] shrink_node
> 1.96% [kernel] [k] super_cache_count
> 1.78% [kernel] [k] __rcu_read_unlock
> 1.38% [kernel] [k] __srcu_read_lock
> 1.30% [kernel] [k] xas_descend
>
> We can see that the readers are no longer blocked.
>
> Signed-off-by: Qi Zheng <[email protected]>
> ---
> mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 95a3d6ddc6c1..dc47396ecd0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int i, nid, srcu_idx;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> parent = root_mem_cgroup;
>
> /* Prevent from concurrent shrinker_info expand */
> - down_read(&shrinker_rwsem);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);

Don't we still have to be protected against parallel expand_one_shrinker_info()?

It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
right after we've dereferenced it here.
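
To make the window concrete, a possible interleaving (a sketch based
on the hunks quoted above, not text from the original mail):

```
/*
 * reparent_shrinker_deferred()              expand_one_shrinker_info()
 *
 * srcu_idx = srcu_read_lock(&shrinker_srcu);
 * parent_info = shrinker_info_srcu(parent, nid);
 *                                            new = kvmalloc_node(...);
 *                                            ... copy old map and nr_deferred ...
 *                                            rcu_assign_pointer(pn->shrinker_info, new);
 *                                            call_srcu(&shrinker_srcu, &old->rcu,
 *                                                      free_shrinker_info_rcu);
 * atomic_long_add(nr, &parent_info->nr_deferred[i]);
 *	^ adds to the old copy, whose counters were already copied into "new",
 *	  so the reparented deferred work is lost once "old" is freed
 * srcu_read_unlock(&shrinker_srcu, srcu_idx);
 */
```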

> for_each_node(nid) {
> - child_info = shrinker_info_protected(memcg, nid);
> - parent_info = shrinker_info_protected(parent, nid);
> + child_info = shrinker_info_srcu(memcg, nid);
> + parent_info = shrinker_info_srcu(parent, nid);
> for (i = 0; i < shrinker_nr_max; i++) {
> nr = atomic_long_read(&child_info->nr_deferred[i]);
> atomic_long_add(nr, &parent_info->nr_deferred[i]);
> }
> }
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
>
> static bool cgroup_reclaim(struct scan_control *sc)
> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;
>
> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */


2023-02-21 21:43:20

by Kirill Tkhai

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

On 20.02.2023 12:16, Qi Zheng wrote:
> Like the global slab shrink, now that commit 1cd0bd06093c
> ("rcu: Remove CONFIG_SRCU") has made SRCU unconditionally
> available, it's time to use SRCU to protect the readers that
> previously had to hold shrinker_rwsem.
>
> We can test with the following script:
>
> ```
> DIR="/root/shrinker/memcg/mnt"
>
> do_create()
> {
> mkdir /sys/fs/cgroup/memory/test
> echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> for i in `seq 0 $1`;
> do
> mkdir /sys/fs/cgroup/memory/test/$i;
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> mkdir -p $DIR/$i;
> done
> }
>
> do_mount()
> {
> for i in `seq $1 $2`;
> do
> mount -t tmpfs $i $DIR/$i;
> done
> }
>
> do_touch()
> {
> for i in `seq $1 $2`;
> do
> echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
> dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
> done
> }
>
> do_create 2000
> do_mount 0 2000
> do_touch 0 1000
> ```
>
> Before applying:
>
> 46.60% [kernel] [k] down_read_trylock
> 18.70% [kernel] [k] up_read
> 15.44% [kernel] [k] shrink_slab
> 4.37% [kernel] [k] _find_next_bit
> 2.75% [kernel] [k] xa_load
> 2.07% [kernel] [k] idr_find
> 1.73% [kernel] [k] do_shrink_slab
> 1.42% [kernel] [k] shrink_lruvec
> 0.74% [kernel] [k] shrink_node
> 0.60% [kernel] [k] list_lru_count_one
>
> After applying:
>
> 19.53% [kernel] [k] _find_next_bit
> 14.63% [kernel] [k] do_shrink_slab
> 14.58% [kernel] [k] shrink_slab
> 11.83% [kernel] [k] shrink_lruvec
> 9.33% [kernel] [k] __blk_flush_plug
> 6.67% [kernel] [k] mem_cgroup_iter
> 3.73% [kernel] [k] list_lru_count_one
> 2.43% [kernel] [k] shrink_node
> 1.96% [kernel] [k] super_cache_count
> 1.78% [kernel] [k] __rcu_read_unlock
> 1.38% [kernel] [k] __srcu_read_lock
> 1.30% [kernel] [k] xas_descend
>
> We can see that the readers are no longer blocked.
>
> Signed-off-by: Qi Zheng <[email protected]>
> ---
> mm/vmscan.c | 56 ++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 95a3d6ddc6c1..dc47396ecd0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -57,6 +57,7 @@
> #include <linux/khugepaged.h>
> #include <linux/rculist_nulls.h>
> #include <linux/random.h>
> +#include <linux/srcu.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items)
> static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
> int nid)
> {
> - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
> - lockdep_is_held(&shrinker_rwsem));
> + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu,
> + lockdep_is_held(&shrinker_rwsem));
> +}
> +
> +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
> + int nid)
> +{
> + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
> + &shrinker_srcu);
> +}
> +
> +static void free_shrinker_info_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct shrinker_info, rcu));
> }
>
> static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> defer_size - old_defer_size);
>
> rcu_assign_pointer(pn->shrinker_info, new);
> - kvfree_rcu(old, rcu);
> + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
> }
>
> return 0;
> @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
> {
> if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
> struct shrinker_info *info;
> + int srcu_idx;
>
> - rcu_read_lock();
> - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> /* Pairs with smp mb in shrink_slab() */
> smp_mb__before_atomic();
> set_bit(shrinker_id, info->map);
> - rcu_read_unlock();
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
> }
>
> @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> return -ENOSYS;
>
> down_write(&shrinker_rwsem);
> - /* This may call shrinker, so it must use down_read_trylock() */
> id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (id < 0)
> goto unlock;
> @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> }
>
> @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> {
> struct shrinker_info *info;
>
> - info = shrinker_info_protected(memcg, nid);
> + info = shrinker_info_srcu(memcg, nid);
> return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int i, nid, srcu_idx;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> parent = root_mem_cgroup;
>
> /* Prevent from concurrent shrinker_info expand */
> - down_read(&shrinker_rwsem);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> for_each_node(nid) {
> - child_info = shrinker_info_protected(memcg, nid);
> - parent_info = shrinker_info_protected(parent, nid);
> + child_info = shrinker_info_srcu(memcg, nid);
> + parent_info = shrinker_info_srcu(parent, nid);
> for (i = 0; i < shrinker_nr_max; i++) {
> nr = atomic_long_read(&child_info->nr_deferred[i]);
> atomic_long_add(nr, &parent_info->nr_deferred[i]);
> }
> }
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> }
>
> static bool cgroup_reclaim(struct scan_control *sc)
> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> + int srcu_idx;
> int i;
>
> if (!mem_cgroup_online(memcg))
> return 0;
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 0;
> -
> - info = shrinker_info_protected(memcg, nid);
> + srcu_idx = srcu_read_lock(&shrinker_srcu);
> + info = shrinker_info_srcu(memcg, nid);
> if (unlikely(!info))
> goto unlock;

There is shrinker_nr_max dereference under this hunk. It's not in the patch:

for_each_set_bit(i, info->map, shrinker_nr_max) {

Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(

It looks like we should save size of info->map as a new member of struct shrinker_info.
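
Spelled out, the window looks roughly like this (a sketch; the numbers
are only an example):

```
/*
 * shrink_slab_memcg() (SRCU reader)          shrinker registration
 *
 * srcu_idx = srcu_read_lock(&shrinker_srcu);
 * info = shrinker_info_srcu(memcg, nid);
 *   - info->map was allocated when
 *     shrinker_nr_max was, say, 64
 *                                             prealloc_memcg_shrinker()
 *                                               expand_shrinker_info(id)
 *                                               shrinker_nr_max = id + 1;  // e.g. 65
 * for_each_set_bit(i, info->map, shrinker_nr_max)
 *   - scans up to 65 bits over an 8-byte
 *     (64-bit) map: reads past the end of
 *     the allocation the reader still holds
 */
```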

>
> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> set_shrinker_bit(memcg, nid, i);
> }
> freed += ret;
> -
> - if (rwsem_is_contended(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> - break;
> - }
> }
> unlock:
> - up_read(&shrinker_rwsem);
> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
> return freed;
> }
> #else /* CONFIG_MEMCG */


2023-02-22 07:32:16

by Qi Zheng

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless



On 2023/2/22 05:28, Kirill Tkhai wrote:
> On 20.02.2023 12:16, Qi Zheng wrote:
<...>
>>
>> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> {
>> - int i, nid;
>> + int i, nid, srcu_idx;
>> long nr;
>> struct mem_cgroup *parent;
>> struct shrinker_info *child_info, *parent_info;
>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> parent = root_mem_cgroup;
>>
>> /* Prevent from concurrent shrinker_info expand */
>> - down_read(&shrinker_rwsem);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>
> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>
> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
> right after we've dereferenced it here.

Hi Kirill,

Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
parent's nr_deferred (it is about to be freed by call_srcu).

The reparent_shrinker_deferred() will only be called on the offline
path (not a hotspot path), so we may be able to use shrinker_mutex
introduced later for protection. What do you think?
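
Roughly like this (a sketch only, assuming the shrinker_mutex introduced
later in this series; shrinker_info_protected()'s lockdep expression would
have to check that mutex as well):

```
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Offline path only: a sleeping lock is fine and keeps expand away. */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < shrinker_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	mutex_unlock(&shrinker_mutex);
}
```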

Thanks,
Qi

>
>> for_each_node(nid) {
>> - child_info = shrinker_info_protected(memcg, nid);
>> - parent_info = shrinker_info_protected(parent, nid);
>> + child_info = shrinker_info_srcu(memcg, nid);
>> + parent_info = shrinker_info_srcu(parent, nid);
>> for (i = 0; i < shrinker_nr_max; i++) {
>> nr = atomic_long_read(&child_info->nr_deferred[i]);
>> atomic_long_add(nr, &parent_info->nr_deferred[i]);
>> }
>> }
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> }
>>
>> static bool cgroup_reclaim(struct scan_control *sc)
>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> {
>> struct shrinker_info *info;
>> unsigned long ret, freed = 0;
>> + int srcu_idx;
>> int i;
>>
>> if (!mem_cgroup_online(memcg))
>> return 0;
>>
>> - if (!down_read_trylock(&shrinker_rwsem))
>> - return 0;
>> -
>> - info = shrinker_info_protected(memcg, nid);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>> + info = shrinker_info_srcu(memcg, nid);
>> if (unlikely(!info))
>> goto unlock;
>>
>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> set_shrinker_bit(memcg, nid, i);
>> }
>> freed += ret;
>> -
>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>> - freed = freed ? : 1;
>> - break;
>> - }
>> }
>> unlock:
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> return freed;
>> }
>> #else /* CONFIG_MEMCG */
>

--
Thanks,
Qi

2023-02-22 08:16:49

by Qi Zheng

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

Hi Kirill,

On 2023/2/22 05:43, Kirill Tkhai wrote:
> On 20.02.2023 12:16, Qi Zheng wrote:
>> Like global slab shrink, since commit 1cd0bd06093c<...>
>> static bool cgroup_reclaim(struct scan_control *sc)
>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> {
>> struct shrinker_info *info;
>> unsigned long ret, freed = 0;
>> + int srcu_idx;
>> int i;
>>
>> if (!mem_cgroup_online(memcg))
>> return 0;
>>
>> - if (!down_read_trylock(&shrinker_rwsem))
>> - return 0;
>> -
>> - info = shrinker_info_protected(memcg, nid);
>> + srcu_idx = srcu_read_lock(&shrinker_srcu);
>> + info = shrinker_info_srcu(memcg, nid);
>> if (unlikely(!info))
>> goto unlock;
>
> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>
> for_each_set_bit(i, info->map, shrinker_nr_max) {
>
> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(

Oh, indeed.

>
> It looks like we should save size of info->map as a new member of struct shrinker_info.

Agree, then we only traverse info->map_size each time. Like below:

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6eda2ab205d..f1b5d2803007 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -97,6 +97,7 @@ struct shrinker_info {
struct rcu_head rcu;
atomic_long_t *nr_deferred;
unsigned long *map;
+ int map_size;
};

struct lruvec_stats_percpu {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f94bfe540675..dd07eb107915 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
kvfree(container_of(head, struct shrinker_info, rcu));
}

-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
- int map_size, int defer_size,
- int old_map_size, int old_defer_size)
+static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
+ int old_nr_max)
{
+ int map_size, defer_size, old_map_size, old_defer_size;
struct shrinker_info *new, *old;
struct mem_cgroup_per_node *pn;
int nid;
- int size = map_size + defer_size;
+ int size;
+
+ map_size = shrinker_map_size(new_nr_max);
+ defer_size = shrinker_defer_size(new_nr_max);
+ old_map_size = shrinker_map_size(shrinker_nr_max);
+ old_defer_size = shrinker_defer_size(shrinker_nr_max);
+ size = map_size + defer_size;

for_each_node(nid) {
pn = memcg->nodeinfo[nid];
@@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,

new->nr_deferred = (atomic_long_t *)(new + 1);
new->map = (void *)new->nr_deferred + defer_size;
+ new->map_size = new_nr_max;

/* map: set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_map_size);
@@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
}
info->nr_deferred = (atomic_long_t *)(info + 1);
info->map = (void *)info->nr_deferred + defer_size;
+ info->map_size = shrinker_nr_max;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
mutex_unlock(&shrinker_mutex);
@@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
{
int ret = 0;
int new_nr_max = new_id + 1;
- int map_size, defer_size = 0;
- int old_map_size, old_defer_size = 0;
struct mem_cgroup *memcg;

if (!need_expand(new_nr_max))
@@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)

lockdep_assert_held(&shrinker_mutex);

- map_size = shrinker_map_size(new_nr_max);
- defer_size = shrinker_defer_size(new_nr_max);
- old_map_size = shrinker_map_size(shrinker_nr_max);
- old_defer_size = shrinker_defer_size(shrinker_nr_max);
-
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size, old_defer_size);
+ ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
goto out;
@@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (unlikely(!info))
goto unlock;

- for_each_set_bit(i, info->map, shrinker_nr_max) {
+ for_each_set_bit(i, info->map, info->map_size) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,

I will send the v2.

Thanks,
Qi

>
>>
>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> set_shrinker_bit(memcg, nid, i);
>> }
>> freed += ret;
>> -
>> - if (rwsem_is_contended(&shrinker_rwsem)) {
>> - freed = freed ? : 1;
>> - break;
>> - }
>> }
>> unlock:
>> - up_read(&shrinker_rwsem);
>> + srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> return freed;
>> }
>> #else /* CONFIG_MEMCG */
>

--
Thanks,
Qi

2023-02-22 08:22:01

by Qi Zheng

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless



On 2023/2/22 16:16, Qi Zheng wrote:
> Hi Kirill,
>
> On 2023/2/22 05:43, Kirill Tkhai wrote:
>> On 20.02.2023 12:16, Qi Zheng wrote:
>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>>   static bool cgroup_reclaim(struct scan_control *sc)
>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t
>>> gfp_mask, int nid,
>>>   {
>>>       struct shrinker_info *info;
>>>       unsigned long ret, freed = 0;
>>> +    int srcu_idx;
>>>       int i;
>>>       if (!mem_cgroup_online(memcg))
>>>           return 0;
>>> -    if (!down_read_trylock(&shrinker_rwsem))
>>> -        return 0;
>>> -
>>> -    info = shrinker_info_protected(memcg, nid);
>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> +    info = shrinker_info_srcu(memcg, nid);
>>>       if (unlikely(!info))
>>>           goto unlock;
>>
>> There is shrinker_nr_max dereference under this hunk. It's not in the
>> patch:
>>
>>          for_each_set_bit(i, info->map, shrinker_nr_max) {
>>
>> Since shrinker_nr_max may grow in parallel, this leads to access
>> beyond allocated memory :(
>
> Oh, indeed.
>
>>
>> It looks like we should save size of info->map as a new member of
>> struct shrinker_info.
>
> Agree, then we only traverse info->map_size each time. Like below:
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index b6eda2ab205d..f1b5d2803007 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -97,6 +97,7 @@ struct shrinker_info {
>         struct rcu_head rcu;
>         atomic_long_t *nr_deferred;
>         unsigned long *map;
> +       int map_size;
>  };
>
>  struct lruvec_stats_percpu {
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f94bfe540675..dd07eb107915 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
>         kvfree(container_of(head, struct shrinker_info, rcu));
>  }
>
> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
> -                                   int map_size, int defer_size,
> -                                   int old_map_size, int old_defer_size)
> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
> +                                   int old_nr_max)
>  {
> +       int map_size, defer_size, old_map_size, old_defer_size;
>         struct shrinker_info *new, *old;
>         struct mem_cgroup_per_node *pn;
>         int nid;
> -       int size = map_size + defer_size;
> +       int size;
> +
> +       map_size = shrinker_map_size(new_nr_max);
> +       defer_size = shrinker_defer_size(new_nr_max);
> +       old_map_size = shrinker_map_size(shrinker_nr_max);
> +       old_defer_size = shrinker_defer_size(shrinker_nr_max);

Perhaps these should still be calculated outside the loop as before.

> +       size = map_size + defer_size;
>
>         for_each_node(nid) {
>                 pn = memcg->nodeinfo[nid];
> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>
>                 new->nr_deferred = (atomic_long_t *)(new + 1);
>                 new->map = (void *)new->nr_deferred + defer_size;
> +               new->map_size = new_nr_max;
>
>                 /* map: set all old bits, clear all new bits */
>                 memset(new->map, (int)0xff, old_map_size);
> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
>                 }
>                 info->nr_deferred = (atomic_long_t *)(info + 1);
>                 info->map = (void *)info->nr_deferred + defer_size;
> +               info->map_size = shrinker_nr_max;
>                 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
>         }
>         mutex_unlock(&shrinker_mutex);
> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
>  {
>         int ret = 0;
>         int new_nr_max = new_id + 1;
> -       int map_size, defer_size = 0;
> -       int old_map_size, old_defer_size = 0;
>         struct mem_cgroup *memcg;
>
>         if (!need_expand(new_nr_max))
> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>
>         lockdep_assert_held(&shrinker_mutex);
>
> -       map_size = shrinker_map_size(new_nr_max);
> -       defer_size = shrinker_defer_size(new_nr_max);
> -       old_map_size = shrinker_map_size(shrinker_nr_max);
> -       old_defer_size = shrinker_defer_size(shrinker_nr_max);
> -
>         memcg = mem_cgroup_iter(NULL, NULL, NULL);
>         do {
> -               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
> -                                              old_map_size, old_defer_size);
> +               ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
>                 if (ret) {
>                         mem_cgroup_iter_break(NULL, memcg);
>                         goto out;
> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>         if (unlikely(!info))
>                 goto unlock;
>
> -       for_each_set_bit(i, info->map, shrinker_nr_max) {
> +       for_each_set_bit(i, info->map, info->map_size) {
>                 struct shrink_control sc = {
>                         .gfp_mask = gfp_mask,
>                         .nid = nid,
>
> I will send the v2.
>
> Thanks,
> Qi
>
>>
>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t
>>> gfp_mask, int nid,
>>>                   set_shrinker_bit(memcg, nid, i);
>>>           }
>>>           freed += ret;
>>> -
>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>> -            freed = freed ? : 1;
>>> -            break;
>>> -        }
>>>       }
>>>   unlock:
>>> -    up_read(&shrinker_rwsem);
>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>       return freed;
>>>   }
>>>   #else /* CONFIG_MEMCG */
>>
>

--
Thanks,
Qi

2023-02-22 19:59:18

by Kirill Tkhai

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

On 22.02.2023 10:32, Qi Zheng wrote:
>
>
> On 2023/2/22 05:28, Kirill Tkhai wrote:
>> On 20.02.2023 12:16, Qi Zheng wrote:
> <...>
>>>     void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>   {
>>> -    int i, nid;
>>> +    int i, nid, srcu_idx;
>>>       long nr;
>>>       struct mem_cgroup *parent;
>>>       struct shrinker_info *child_info, *parent_info;
>>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>           parent = root_mem_cgroup;
>>>         /* Prevent from concurrent shrinker_info expand */
>>> -    down_read(&shrinker_rwsem);
>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>
>> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>>
>> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
>> right after we've dereferenced it here.
>
> Hi Kirill,
>
> Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
> parent's nr_deferred (it is about to be freed by call_srcu).
>
> The reparent_shrinker_deferred() will only be called on the offline
> path (not a hotspot path), so we may be able to use shrinker_mutex
> introduced later for protection. What do you think?

It looks good to me. One more thing I'd analyze is whether two
reparent_shrinker_deferred() calls can end up executing in parallel.

Possibly, we should leave the rwsem there as it was used before.

>>
>>>       for_each_node(nid) {
>>> -        child_info = shrinker_info_protected(memcg, nid);
>>> -        parent_info = shrinker_info_protected(parent, nid);
>>> +        child_info = shrinker_info_srcu(memcg, nid);
>>> +        parent_info = shrinker_info_srcu(parent, nid);
>>>           for (i = 0; i < shrinker_nr_max; i++) {
>>>               nr = atomic_long_read(&child_info->nr_deferred[i]);
>>>               atomic_long_add(nr, &parent_info->nr_deferred[i]);
>>>           }
>>>       }
>>> -    up_read(&shrinker_rwsem);
>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>   }
>>>     static bool cgroup_reclaim(struct scan_control *sc)
>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>   {
>>>       struct shrinker_info *info;
>>>       unsigned long ret, freed = 0;
>>> +    int srcu_idx;
>>>       int i;
>>>         if (!mem_cgroup_online(memcg))
>>>           return 0;
>>>   -    if (!down_read_trylock(&shrinker_rwsem))
>>> -        return 0;
>>> -
>>> -    info = shrinker_info_protected(memcg, nid);
>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> +    info = shrinker_info_srcu(memcg, nid);
>>>       if (unlikely(!info))
>>>           goto unlock;
>>>   @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>                   set_shrinker_bit(memcg, nid, i);
>>>           }
>>>           freed += ret;
>>> -
>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>> -            freed = freed ? : 1;
>>> -            break;
>>> -        }
>>>       }
>>>   unlock:
>>> -    up_read(&shrinker_rwsem);
>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>       return freed;
>>>   }
>>>   #else /* CONFIG_MEMCG */
>>
>


2023-02-22 20:06:01

by Kirill Tkhai

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless

On 22.02.2023 11:21, Qi Zheng wrote:
>
>
> On 2023/2/22 16:16, Qi Zheng wrote:
>> Hi Kirill,
>>
>> On 2023/2/22 05:43, Kirill Tkhai wrote:
>>> On 20.02.2023 12:16, Qi Zheng wrote:
>>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>>>   static bool cgroup_reclaim(struct scan_control *sc)
>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>   {
>>>>       struct shrinker_info *info;
>>>>       unsigned long ret, freed = 0;
>>>> +    int srcu_idx;
>>>>       int i;
>>>>       if (!mem_cgroup_online(memcg))
>>>>           return 0;
>>>> -    if (!down_read_trylock(&shrinker_rwsem))
>>>> -        return 0;
>>>> -
>>>> -    info = shrinker_info_protected(memcg, nid);
>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> +    info = shrinker_info_srcu(memcg, nid);
>>>>       if (unlikely(!info))
>>>>           goto unlock;
>>>
>>> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>>>
>>>          for_each_set_bit(i, info->map, shrinker_nr_max) {
>>>
>>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
>>
>> Oh, indeed.
>>
>>>
>>> It looks like we should save size of info->map as a new member of struct shrinker_info.
>>
>> Agree, then we only traverse info->map_size each time. Like below:
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b6eda2ab205d..f1b5d2803007 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -97,6 +97,7 @@ struct shrinker_info {
>>          struct rcu_head rcu;
>>          atomic_long_t *nr_deferred;
>>          unsigned long *map;
>> +       int map_size;

Sure, like this. The only thing (after rethinking) I want to say is that, although "size" was
my suggestion, the name "size" now makes me think of a size in bytes.

Possibly, something like map_nr_max would be better here.

>>   };
>>
>>   struct lruvec_stats_percpu {
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index f94bfe540675..dd07eb107915 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
>>          kvfree(container_of(head, struct shrinker_info, rcu));
>>   }
>>
>> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>> -                                   int map_size, int defer_size,
>> -                                   int old_map_size, int old_defer_size)
>> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
>> +                                   int old_nr_max)
>>   {
>> +       int map_size, defer_size, old_map_size, old_defer_size;
>>          struct shrinker_info *new, *old;
>>          struct mem_cgroup_per_node *pn;
>>          int nid;
>> -       int size = map_size + defer_size;
>> +       int size;
>> +
>> +       map_size = shrinker_map_size(new_nr_max);
>> +       defer_size = shrinker_defer_size(new_nr_max);
>> +       old_map_size = shrinker_map_size(shrinker_nr_max);
>> +       old_defer_size = shrinker_defer_size(shrinker_nr_max);
>
> Perhaps these should still be calculated outside the loop as before.

Yeah, for me it's also better.

>> +       size = map_size + defer_size;
>>
>>          for_each_node(nid) {
>>                  pn = memcg->nodeinfo[nid];
>> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>
>>                  new->nr_deferred = (atomic_long_t *)(new + 1);
>>                  new->map = (void *)new->nr_deferred + defer_size;
>> +               new->map_size = new_nr_max;
>>
>>                  /* map: set all old bits, clear all new bits */
>>                  memset(new->map, (int)0xff, old_map_size);
>> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
>>                  }
>>                  info->nr_deferred = (atomic_long_t *)(info + 1);
>>                  info->map = (void *)info->nr_deferred + defer_size;
>> +               info->map_size = shrinker_nr_max;
>>                  rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
>>          }
>>          mutex_unlock(&shrinker_mutex);
>> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
>>   {
>>          int ret = 0;
>>          int new_nr_max = new_id + 1;
>> -       int map_size, defer_size = 0;
>> -       int old_map_size, old_defer_size = 0;
>>          struct mem_cgroup *memcg;
>>
>>          if (!need_expand(new_nr_max))
>> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>>
>>          lockdep_assert_held(&shrinker_mutex);
>>
>> -       map_size = shrinker_map_size(new_nr_max);
>> -       defer_size = shrinker_defer_size(new_nr_max);
>> -       old_map_size = shrinker_map_size(shrinker_nr_max);
>> -       old_defer_size = shrinker_defer_size(shrinker_nr_max);
>> -
>>          memcg = mem_cgroup_iter(NULL, NULL, NULL);
>>          do {
>> -               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
>> -                                              old_map_size, old_defer_size);
>> +               ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
>>                  if (ret) {
>>                          mem_cgroup_iter_break(NULL, memcg);
>>                          goto out;
>> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>          if (unlikely(!info))
>>                  goto unlock;
>>
>> -       for_each_set_bit(i, info->map, shrinker_nr_max) {
>> +       for_each_set_bit(i, info->map, info->map_size) {
>>                  struct shrink_control sc = {
>>                          .gfp_mask = gfp_mask,
>>                          .nid = nid,
>>
>> I will send the v2.
>>
>> Thanks,
>> Qi
>>
>>>
>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>                   set_shrinker_bit(memcg, nid, i);
>>>>           }
>>>>           freed += ret;
>>>> -
>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> -            freed = freed ? : 1;
>>>> -            break;
>>>> -        }
>>>>       }
>>>>   unlock:
>>>> -    up_read(&shrinker_rwsem);
>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>       return freed;
>>>>   }
>>>>   #else /* CONFIG_MEMCG */
>>>
>>
>


2023-02-23 04:36:17

by Qi Zheng

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless



On 2023/2/23 03:58, Kirill Tkhai wrote:
> On 22.02.2023 10:32, Qi Zheng wrote:
>>
>>
>> On 2023/2/22 05:28, Kirill Tkhai wrote:
>>> On 20.02.2023 12:16, Qi Zheng wrote:
>> <...>
>>>>     void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>>   {
>>>> -    int i, nid;
>>>> +    int i, nid, srcu_idx;
>>>>       long nr;
>>>>       struct mem_cgroup *parent;
>>>>       struct shrinker_info *child_info, *parent_info;
>>>> @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>>>>           parent = root_mem_cgroup;
>>>>         /* Prevent from concurrent shrinker_info expand */
>>>> -    down_read(&shrinker_rwsem);
>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>
>>> Don't we still have to be protected against parallel expand_one_shrinker_info()?
>>>
>>> It looks like parent->nodeinfo[nid]->shrinker_info pointer may be changed in expand*
>>> right after we've dereferenced it here.
>>
>> Hi Kirill,
>>
>> Oh, indeed. We may wrongly reparent the child's nr_deferred to the old
>> parent's nr_deferred (it is about to be freed by call_srcu).
>>
>> The reparent_shrinker_deferred() will only be called on the offline
>> path (not a hotspot path), so we may be able to use shrinker_mutex
>> introduced later for protection. What do you think?
>
> It looks good to me. One more thing I'd analyze is whether two
> reparent_shrinker_deferred() calls can end up executing in parallel.

I see that mem_cgroup_css_offline() is already protected by
cgroup_mutex, so maybe shrinker_mutex is enough here. :)

>
> Possibly, we should leave the rwsem there as it was used before.
>
>>>
>>>>       for_each_node(nid) {
>>>> -        child_info = shrinker_info_protected(memcg, nid);
>>>> -        parent_info = shrinker_info_protected(parent, nid);
>>>> +        child_info = shrinker_info_srcu(memcg, nid);
>>>> +        parent_info = shrinker_info_srcu(parent, nid);
>>>>           for (i = 0; i < shrinker_nr_max; i++) {
>>>>               nr = atomic_long_read(&child_info->nr_deferred[i]);
>>>>               atomic_long_add(nr, &parent_info->nr_deferred[i]);
>>>>           }
>>>>       }
>>>> -    up_read(&shrinker_rwsem);
>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>   }
>>>>     static bool cgroup_reclaim(struct scan_control *sc)
>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>   {
>>>>       struct shrinker_info *info;
>>>>       unsigned long ret, freed = 0;
>>>> +    int srcu_idx;
>>>>       int i;
>>>>         if (!mem_cgroup_online(memcg))
>>>>           return 0;
>>>>   -    if (!down_read_trylock(&shrinker_rwsem))
>>>> -        return 0;
>>>> -
>>>> -    info = shrinker_info_protected(memcg, nid);
>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> +    info = shrinker_info_srcu(memcg, nid);
>>>>       if (unlikely(!info))
>>>>           goto unlock;
>>>>   @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>                   set_shrinker_bit(memcg, nid, i);
>>>>           }
>>>>           freed += ret;
>>>> -
>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> -            freed = freed ? : 1;
>>>> -            break;
>>>> -        }
>>>>       }
>>>>   unlock:
>>>> -    up_read(&shrinker_rwsem);
>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>       return freed;
>>>>   }
>>>>   #else /* CONFIG_MEMCG */
>>>
>>
>

--
Thanks,
Qi

2023-02-23 04:38:00

by Qi Zheng

Subject: Re: [PATCH 2/5] mm: vmscan: make memcg slab shrink lockless



On 2023/2/23 04:05, Kirill Tkhai wrote:
> On 22.02.2023 11:21, Qi Zheng wrote:
>>
>>
>> On 2023/2/22 16:16, Qi Zheng wrote:
>>> Hi Kirill,
>>>
>>> On 2023/2/22 05:43, Kirill Tkhai wrote:
>>>> On 20.02.2023 12:16, Qi Zheng wrote:
>>>>> Like global slab shrink, since commit 1cd0bd06093c<...>
>>>>>   static bool cgroup_reclaim(struct scan_control *sc)
>>>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>   {
>>>>>       struct shrinker_info *info;
>>>>>       unsigned long ret, freed = 0;
>>>>> +    int srcu_idx;
>>>>>       int i;
>>>>>       if (!mem_cgroup_online(memcg))
>>>>>           return 0;
>>>>> -    if (!down_read_trylock(&shrinker_rwsem))
>>>>> -        return 0;
>>>>> -
>>>>> -    info = shrinker_info_protected(memcg, nid);
>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>> +    info = shrinker_info_srcu(memcg, nid);
>>>>>       if (unlikely(!info))
>>>>>           goto unlock;
>>>>
>>>> There is shrinker_nr_max dereference under this hunk. It's not in the patch:
>>>>
>>>>          for_each_set_bit(i, info->map, shrinker_nr_max) {
>>>>
>>>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :(
>>>
>>> Oh, indeed.
>>>
>>>>
>>>> It looks like we should save size of info->map as a new member of struct shrinker_info.
>>>
>>> Agree, then we only traverse info->map_size each time. Like below:
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index b6eda2ab205d..f1b5d2803007 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -97,6 +97,7 @@ struct shrinker_info {
>>>          struct rcu_head rcu;
>>>          atomic_long_t *nr_deferred;
>>>          unsigned long *map;
>>> +       int map_size;
>
> Sure, like this. The only thing (after rethinking) I want to say is that, although "size" was
> my suggestion, the name "size" now makes me think of a size in bytes.
>
> Possibly, something like map_nr_max would be better here.

Agree. Will change to it.
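
i.e. something along these lines (a trivial sketch of the rename, not
the posted v2):

```
struct shrinker_info {
	struct rcu_head rcu;
	atomic_long_t *nr_deferred;
	unsigned long *map;
	int map_nr_max;	/* number of bits valid in *map */
};
```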

>
>>>   };
>>>
>>>   struct lruvec_stats_percpu {
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index f94bfe540675..dd07eb107915 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head)
>>>          kvfree(container_of(head, struct shrinker_info, rcu));
>>>   }
>>>
>>> -static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>> -                                   int map_size, int defer_size,
>>> -                                   int old_map_size, int old_defer_size)
>>> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max,
>>> +                                   int old_nr_max)
>>>   {
>>> +       int map_size, defer_size, old_map_size, old_defer_size;
>>>          struct shrinker_info *new, *old;
>>>          struct mem_cgroup_per_node *pn;
>>>          int nid;
>>> -       int size = map_size + defer_size;
>>> +       int size;
>>> +
>>> +       map_size = shrinker_map_size(new_nr_max);
>>> +       defer_size = shrinker_defer_size(new_nr_max);
>>> +       old_map_size = shrinker_map_size(shrinker_nr_max);
>>> +       old_defer_size = shrinker_defer_size(shrinker_nr_max);
>>
>> Perhaps these should still be calculated outside the loop as before.
>
> Yeah, for me it's also better.
>
>>> +       size = map_size + defer_size;
>>>
>>>          for_each_node(nid) {
>>>                  pn = memcg->nodeinfo[nid];
>>> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
>>>
>>>                  new->nr_deferred = (atomic_long_t *)(new + 1);
>>>                  new->map = (void *)new->nr_deferred + defer_size;
>>> +               new->map_size = new_nr_max;
>>>
>>>                  /* map: set all old bits, clear all new bits */
>>>                  memset(new->map, (int)0xff, old_map_size);
>>> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
>>>                  }
>>>                  info->nr_deferred = (atomic_long_t *)(info + 1);
>>>                  info->map = (void *)info->nr_deferred + defer_size;
>>> +               info->map_size = shrinker_nr_max;
>>>                  rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
>>>          }
>>>          mutex_unlock(&shrinker_mutex);
>>> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id)
>>>   {
>>>          int ret = 0;
>>>          int new_nr_max = new_id + 1;
>>> -       int map_size, defer_size = 0;
>>> -       int old_map_size, old_defer_size = 0;
>>>          struct mem_cgroup *memcg;
>>>
>>>          if (!need_expand(new_nr_max))
>>> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id)
>>>
>>>          lockdep_assert_held(&shrinker_mutex);
>>>
>>> -       map_size = shrinker_map_size(new_nr_max);
>>> -       defer_size = shrinker_defer_size(new_nr_max);
>>> -       old_map_size = shrinker_map_size(shrinker_nr_max);
>>> -       old_defer_size = shrinker_defer_size(shrinker_nr_max);
>>> -
>>>          memcg = mem_cgroup_iter(NULL, NULL, NULL);
>>>          do {
>>> -               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
>>> -                                              old_map_size, old_defer_size);
>>> +               ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max);
>>>                  if (ret) {
>>>                          mem_cgroup_iter_break(NULL, memcg);
>>>                          goto out;
>>> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>          if (unlikely(!info))
>>>                  goto unlock;
>>>
>>> -       for_each_set_bit(i, info->map, shrinker_nr_max) {
>>> +       for_each_set_bit(i, info->map, info->map_size) {
>>>                  struct shrink_control sc = {
>>>                          .gfp_mask = gfp_mask,
>>>                          .nid = nid,
>>>
>>> I will send the v2.
>>>
>>> Thanks,
>>> Qi
>>>
>>>>
>>>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>                   set_shrinker_bit(memcg, nid, i);
>>>>>           }
>>>>>           freed += ret;
>>>>> -
>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>> -            freed = freed ? : 1;
>>>>> -            break;
>>>>> -        }
>>>>>       }
>>>>>   unlock:
>>>>> -    up_read(&shrinker_rwsem);
>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>       return freed;
>>>>>   }
>>>>>   #else /* CONFIG_MEMCG */
>>>>
>>>
>>
>

--
Thanks,
Qi