2023-08-07 11:24:28

by Qi Zheng

[permalink] [raw]
Subject: [PATCH v4 44/48] mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}

Currently, we maintain two linear arrays per node per memcg, which are
shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
them when the shrinker_nr_max is exceeded, that is, allocate a new array,
and then copy the old array to the new array, and finally free the old
array by RCU.

For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
the value into the old map which is about to be freed. This may cause the
value set to be lost. The current solution is not to copy the old map when
resizing, but to set all the corresponding bits in the new map to 1. This
solves the data loss problem, but bring the overhead of more pointless
loops while doing memcg slab shrink.

For shrinker_info::nr_deferred, we will only modify it under the read lock
of shrinker_rwsem, so it will not run concurrently with the resizing. But
after we make memcg slab shrink lockless, there will be the same data loss
problem as shrinker_info::map, and we can't work around it like the map.

For such resizable arrays, the most straightforward idea is to change it
to xarray, like we did for list_lru [1]. We need to do xa_store() in the
list_lru_add()-->set_shrinker_bit(), but this will cause memory
allocation, and the list_lru_add() doesn't accept failure. A possible
solution is to pre-allocate, but the location of pre-allocation is not
well determined.

Therefore, this commit chooses to introduce a secondary array for
shrinker_info::{map, nr_deferred}, so that we only need to copy this
secondary array every time the size is resized. Then even if we get the
old secondary array under the RCU lock, the found map and nr_deferred are
also true, so no data is lost.

[1]. https://lore.kernel.org/all/[email protected]/

Signed-off-by: Qi Zheng <[email protected]>
Reviewed-by: Muchun Song <[email protected]>
---
include/linux/memcontrol.h | 12 +-
include/linux/shrinker.h | 17 +++
mm/shrinker.c | 250 +++++++++++++++++++++++--------------
3 files changed, 172 insertions(+), 107 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 11810a2cfd2d..b49515bb6fbd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
+#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
@@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};

-/*
- * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
- * shrinkers, which have elements charged to this memcg.
- */
-struct shrinker_info {
- struct rcu_head rcu;
- atomic_long_t *nr_deferred;
- unsigned long *map;
- int map_nr_max;
-};
-
struct lruvec_stats_percpu {
/* Local (CPU and cgroup) state */
long state[NR_VM_NODE_STAT_ITEMS];
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 025c8070dd86..eb342994675a 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -5,6 +5,23 @@
#include <linux/atomic.h>
#include <linux/types.h>

+#define SHRINKER_UNIT_BITS BITS_PER_LONG
+
+/*
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to the memcg.
+ */
+struct shrinker_info_unit {
+ atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
+ DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
+};
+
+struct shrinker_info {
+ struct rcu_head rcu;
+ int map_nr_max;
+ struct shrinker_info_unit *unit[];
+};
+
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extension later.
diff --git a/mm/shrinker.c b/mm/shrinker.c
index a27779ed3798..1911c06b8af5 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

-/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
-static inline int shrinker_map_size(int nr_items)
+static inline int shrinker_unit_size(int nr_items)
{
- return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+ return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

-static inline int shrinker_defer_size(int nr_items)
+static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
- return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+ struct shrinker_info_unit **unit;
+ int nr, i;
+
+ if (!info)
+ return;
+
+ unit = info->unit;
+ nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
+
+ for (i = start; i < nr; i++) {
+ if (!unit[i])
+ break;
+
+ kvfree(unit[i]);
+ unit[i] = NULL;
+ }
+}
+
+static inline int shrinker_unit_alloc(struct shrinker_info *new,
+ struct shrinker_info *old, int nid)
+{
+ struct shrinker_info_unit *unit;
+ int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
+ int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
+ int i;
+
+ for (i = start; i < nr; i++) {
+ unit = kvzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
+ if (!unit) {
+ shrinker_unit_free(new, start);
+ return -ENOMEM;
+ }
+
+ new->unit[i] = unit;
+ }
+
+ return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
@@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg)
for_each_node(nid) {
pn = memcg->nodeinfo[nid];
info = rcu_dereference_protected(pn->shrinker_info, true);
+ shrinker_unit_free(info, 0);
kvfree(info);
rcu_assign_pointer(pn->shrinker_info, NULL);
}
@@ -40,28 +76,27 @@ void free_shrinker_info(struct mem_cgroup *memcg)
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
struct shrinker_info *info;
- int nid, size, ret = 0;
- int map_size, defer_size = 0;
+ int nid, ret = 0;
+ int array_size = 0;

down_write(&shrinker_rwsem);
- map_size = shrinker_map_size(shrinker_nr_max);
- defer_size = shrinker_defer_size(shrinker_nr_max);
- size = map_size + defer_size;
+ array_size = shrinker_unit_size(shrinker_nr_max);
for_each_node(nid) {
- info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
- if (!info) {
- free_shrinker_info(memcg);
- ret = -ENOMEM;
- break;
- }
- info->nr_deferred = (atomic_long_t *)(info + 1);
- info->map = (void *)info->nr_deferred + defer_size;
+ info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
+ if (!info)
+ goto err;
info->map_nr_max = shrinker_nr_max;
+ if (shrinker_unit_alloc(info, NULL, nid))
+ goto err;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
up_write(&shrinker_rwsem);

return ret;
+
+err:
+ free_shrinker_info(memcg);
+ return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
@@ -71,15 +106,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
lockdep_is_held(&shrinker_rwsem));
}

-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
- int map_size, int defer_size,
- int old_map_size, int old_defer_size,
- int new_nr_max)
+static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
+ int old_size, int new_nr_max)
{
struct shrinker_info *new, *old;
struct mem_cgroup_per_node *pn;
int nid;
- int size = map_size + defer_size;

for_each_node(nid) {
pn = memcg->nodeinfo[nid];
@@ -92,21 +124,18 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
if (new_nr_max <= old->map_nr_max)
continue;

- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+ new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
if (!new)
return -ENOMEM;

- new->nr_deferred = (atomic_long_t *)(new + 1);
- new->map = (void *)new->nr_deferred + defer_size;
new->map_nr_max = new_nr_max;

- /* map: set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_map_size);
- memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
- /* nr_deferred: copy old values, clear all new values */
- memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
- memset((void *)new->nr_deferred + old_defer_size, 0,
- defer_size - old_defer_size);
+ /* copy old values, allocate all new values */
+ memcpy(new->unit, old->unit, old_size);
+ if (shrinker_unit_alloc(new, old, nid)) {
+ kvfree(new);
+ return -ENOMEM;
+ }

rcu_assign_pointer(pn->shrinker_info, new);
kvfree_rcu(old, rcu);
@@ -118,9 +147,8 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
static int expand_shrinker_info(int new_id)
{
int ret = 0;
- int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
- int map_size, defer_size = 0;
- int old_map_size, old_defer_size = 0;
+ int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
+ int new_size, old_size = 0;
struct mem_cgroup *memcg;

if (!root_mem_cgroup)
@@ -128,15 +156,12 @@ static int expand_shrinker_info(int new_id)

lockdep_assert_held(&shrinker_rwsem);

- map_size = shrinker_map_size(new_nr_max);
- defer_size = shrinker_defer_size(new_nr_max);
- old_map_size = shrinker_map_size(shrinker_nr_max);
- old_defer_size = shrinker_defer_size(shrinker_nr_max);
+ new_size = shrinker_unit_size(new_nr_max);
+ old_size = shrinker_unit_size(shrinker_nr_max);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size, old_defer_size,
+ ret = expand_one_shrinker_info(memcg, new_size, old_size,
new_nr_max);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
@@ -150,17 +175,34 @@ static int expand_shrinker_info(int new_id)
return ret;
}

+static inline int shriner_id_to_index(int shrinker_id)
+{
+ return shrinker_id / SHRINKER_UNIT_BITS;
+}
+
+static inline int shriner_id_to_offset(int shrinker_id)
+{
+ return shrinker_id % SHRINKER_UNIT_BITS;
+}
+
+static inline int calc_shrinker_id(int index, int offset)
+{
+ return index * SHRINKER_UNIT_BITS + offset;
+}
+
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
+ struct shrinker_info_unit *unit;

rcu_read_lock();
info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+ unit = info->unit[shriner_id_to_index(shrinker_id)];
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
- set_bit(shrinker_id, info->map);
+ set_bit(shriner_id_to_offset(shrinker_id), unit->map);
}
rcu_read_unlock();
}
@@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
+ struct shrinker_info_unit *unit;

info = shrinker_info_protected(memcg, nid);
- return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+ unit = info->unit[shriner_id_to_index(shrinker->id)];
+ return atomic_long_xchg(&unit->nr_deferred[shriner_id_to_offset(shrinker->id)], 0);
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
+ struct shrinker_info_unit *unit;

info = shrinker_info_protected(memcg, nid);
- return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+ unit = info->unit[shriner_id_to_index(shrinker->id)];
+ return atomic_long_add_return(nr, &unit->nr_deferred[shriner_id_to_offset(shrinker->id)]);
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
- int i, nid;
+ int nid, index, offset;
long nr;
struct mem_cgroup *parent;
struct shrinker_info *child_info, *parent_info;
+ struct shrinker_info_unit *child_unit, *parent_unit;

parent = parent_mem_cgroup(memcg);
if (!parent)
@@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
for_each_node(nid) {
child_info = shrinker_info_protected(memcg, nid);
parent_info = shrinker_info_protected(parent, nid);
- for (i = 0; i < child_info->map_nr_max; i++) {
- nr = atomic_long_read(&child_info->nr_deferred[i]);
- atomic_long_add(nr, &parent_info->nr_deferred[i]);
+ for (index = 0; index < shriner_id_to_index(child_info->map_nr_max); index++) {
+ child_unit = child_info->unit[index];
+ parent_unit = parent_info->unit[index];
+ for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
+ nr = atomic_long_read(&child_unit->nr_deferred[offset]);
+ atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
+ }
}
}
up_read(&shrinker_rwsem);
@@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int i;
+ int offset, index = 0;

if (!mem_cgroup_online(memcg))
return 0;
@@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
if (unlikely(!info))
goto unlock;

- for_each_set_bit(i, info->map, info->map_nr_max) {
- struct shrink_control sc = {
- .gfp_mask = gfp_mask,
- .nid = nid,
- .memcg = memcg,
- };
- struct shrinker *shrinker;
+ for (; index < shriner_id_to_index(info->map_nr_max); index++) {
+ struct shrinker_info_unit *unit;

- shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
- if (!shrinker)
- clear_bit(i, info->map);
- continue;
- }
+ unit = info->unit[index];

- /* Call non-slab shrinkers even though kmem is disabled */
- if (!memcg_kmem_online() &&
- !(shrinker->flags & SHRINKER_NONSLAB))
- continue;
+ for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+ int shrinker_id = calc_shrinker_id(index, offset);

- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY) {
- clear_bit(i, info->map);
- /*
- * After the shrinker reported that it had no objects to
- * free, but before we cleared the corresponding bit in
- * the memcg shrinker map, a new object might have been
- * added. To make sure, we have the bit set in this
- * case, we invoke the shrinker one more time and reset
- * the bit if it reports that it is not empty anymore.
- * The memory barrier here pairs with the barrier in
- * set_shrinker_bit():
- *
- * list_lru_add() shrink_slab_memcg()
- * list_add_tail() clear_bit()
- * <MB> <MB>
- * set_bit() do_shrink_slab()
- */
- smp_mb__after_atomic();
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY)
- ret = 0;
- else
- set_shrinker_bit(memcg, nid, i);
- }
- freed += ret;
+ shrinker = idr_find(&shrinker_idr, shrinker_id);
+ if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
+ if (!shrinker)
+ clear_bit(offset, unit->map);
+ continue;
+ }

- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_online() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(offset, unit->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure, we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ set_shrinker_bit(memcg, nid, shrinker_id);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ goto unlock;
+ }
}
}
unlock:
--
2.30.2



2023-08-08 02:52:22

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH v4 44/48] mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}

On Mon, Aug 07, 2023 at 07:09:32PM +0800, Qi Zheng wrote:
> Currently, we maintain two linear arrays per node per memcg, which are
> shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
> them when the shrinker_nr_max is exceeded, that is, allocate a new array,
> and then copy the old array to the new array, and finally free the old
> array by RCU.
>
> For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
> the value into the old map which is about to be freed. This may cause the
> value set to be lost. The current solution is not to copy the old map when
> resizing, but to set all the corresponding bits in the new map to 1. This
> solves the data loss problem, but bring the overhead of more pointless
> loops while doing memcg slab shrink.
>
> For shrinker_info::nr_deferred, we will only modify it under the read lock
> of shrinker_rwsem, so it will not run concurrently with the resizing. But
> after we make memcg slab shrink lockless, there will be the same data loss
> problem as shrinker_info::map, and we can't work around it like the map.
>
> For such resizable arrays, the most straightforward idea is to change it
> to xarray, like we did for list_lru [1]. We need to do xa_store() in the
> list_lru_add()-->set_shrinker_bit(), but this will cause memory
> allocation, and the list_lru_add() doesn't accept failure. A possible
> solution is to pre-allocate, but the location of pre-allocation is not
> well determined.

So you implemented a two level array that preallocates leaf
nodes to work around it? It's remarkable complex for what it does,
I can't help but think a radix tree using a special holder for
nr_deferred values of zero would end up being simpler...

> Therefore, this commit chooses to introduce a secondary array for
> shrinker_info::{map, nr_deferred}, so that we only need to copy this
> secondary array every time the size is resized. Then even if we get the
> old secondary array under the RCU lock, the found map and nr_deferred are
> also true, so no data is lost.

I don't understand what you are trying to describe here. If we get
the old array, then don't we get either a stale nr_deferred value,
or the update we do gets lost because the next shrinker lookup will
find the new array and os the deferred value stored to the old one
is never seen again?

>
> [1]. https://lore.kernel.org/all/[email protected]/
>
> Signed-off-by: Qi Zheng <[email protected]>
> Reviewed-by: Muchun Song <[email protected]>
> ---
.....
> diff --git a/mm/shrinker.c b/mm/shrinker.c
> index a27779ed3798..1911c06b8af5 100644
> --- a/mm/shrinker.c
> +++ b/mm/shrinker.c
> @@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem);
> #ifdef CONFIG_MEMCG
> static int shrinker_nr_max;
>
> -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
> -static inline int shrinker_map_size(int nr_items)
> +static inline int shrinker_unit_size(int nr_items)
> {
> - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
> + return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
> }
>
> -static inline int shrinker_defer_size(int nr_items)
> +static inline void shrinker_unit_free(struct shrinker_info *info, int start)
> {
> - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
> + struct shrinker_info_unit **unit;
> + int nr, i;
> +
> + if (!info)
> + return;
> +
> + unit = info->unit;
> + nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
> +
> + for (i = start; i < nr; i++) {
> + if (!unit[i])
> + break;
> +
> + kvfree(unit[i]);
> + unit[i] = NULL;
> + }
> +}
> +
> +static inline int shrinker_unit_alloc(struct shrinker_info *new,
> + struct shrinker_info *old, int nid)
> +{
> + struct shrinker_info_unit *unit;
> + int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
> + int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
> + int i;
> +
> + for (i = start; i < nr; i++) {
> + unit = kvzalloc_node(sizeof(*unit), GFP_KERNEL, nid);

A unit is 576 bytes. Why is this using kvzalloc_node()?

> + if (!unit) {
> + shrinker_unit_free(new, start);
> + return -ENOMEM;
> + }
> +
> + new->unit[i] = unit;
> + }
> +
> + return 0;
> }
>
> void free_shrinker_info(struct mem_cgroup *memcg)
> @@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg)
> for_each_node(nid) {
> pn = memcg->nodeinfo[nid];
> info = rcu_dereference_protected(pn->shrinker_info, true);
> + shrinker_unit_free(info, 0);
> kvfree(info);
> rcu_assign_pointer(pn->shrinker_info, NULL);
> }

Why is this safe? The info and maps are looked up by RCU, so why is
freeing them without a RCU grace period expiring safe?

Yes, it was safe to do this when it was all under a semaphore, but
now the lookup and use is under RCU, so this freeing isn't
serialised against lookups anymore...


> @@ -40,28 +76,27 @@ void free_shrinker_info(struct mem_cgroup *memcg)
> int alloc_shrinker_info(struct mem_cgroup *memcg)
> {
> struct shrinker_info *info;
> - int nid, size, ret = 0;
> - int map_size, defer_size = 0;
> + int nid, ret = 0;
> + int array_size = 0;
>
> down_write(&shrinker_rwsem);
> - map_size = shrinker_map_size(shrinker_nr_max);
> - defer_size = shrinker_defer_size(shrinker_nr_max);
> - size = map_size + defer_size;
> + array_size = shrinker_unit_size(shrinker_nr_max);
> for_each_node(nid) {
> - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
> - if (!info) {
> - free_shrinker_info(memcg);
> - ret = -ENOMEM;
> - break;
> - }
> - info->nr_deferred = (atomic_long_t *)(info + 1);
> - info->map = (void *)info->nr_deferred + defer_size;
> + info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
> + if (!info)
> + goto err;
> info->map_nr_max = shrinker_nr_max;
> + if (shrinker_unit_alloc(info, NULL, nid))
> + goto err;

That's going to now do a lot of small memory allocation when we have
lots of shrinkers active....

> @@ -150,17 +175,34 @@ static int expand_shrinker_info(int new_id)
> return ret;
> }
>
> +static inline int shriner_id_to_index(int shrinker_id)

shrinker_id_to_index

> +{
> + return shrinker_id / SHRINKER_UNIT_BITS;
> +}
> +
> +static inline int shriner_id_to_offset(int shrinker_id)

shrinker_id_to_offset

> +{
> + return shrinker_id % SHRINKER_UNIT_BITS;
> +}

....
> @@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
> struct mem_cgroup *memcg)
> {
> struct shrinker_info *info;
> + struct shrinker_info_unit *unit;
>
> info = shrinker_info_protected(memcg, nid);
> - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
> + unit = info->unit[shriner_id_to_index(shrinker->id)];
> + return atomic_long_xchg(&unit->nr_deferred[shriner_id_to_offset(shrinker->id)], 0);
> }
>
> static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
> struct mem_cgroup *memcg)
> {
> struct shrinker_info *info;
> + struct shrinker_info_unit *unit;
>
> info = shrinker_info_protected(memcg, nid);
> - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
> + unit = info->unit[shriner_id_to_index(shrinker->id)];
> + return atomic_long_add_return(nr, &unit->nr_deferred[shriner_id_to_offset(shrinker->id)]);
> }
>
> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> {
> - int i, nid;
> + int nid, index, offset;
> long nr;
> struct mem_cgroup *parent;
> struct shrinker_info *child_info, *parent_info;
> + struct shrinker_info_unit *child_unit, *parent_unit;
>
> parent = parent_mem_cgroup(memcg);
> if (!parent)
> @@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
> for_each_node(nid) {
> child_info = shrinker_info_protected(memcg, nid);
> parent_info = shrinker_info_protected(parent, nid);
> - for (i = 0; i < child_info->map_nr_max; i++) {
> - nr = atomic_long_read(&child_info->nr_deferred[i]);
> - atomic_long_add(nr, &parent_info->nr_deferred[i]);
> + for (index = 0; index < shriner_id_to_index(child_info->map_nr_max); index++) {
> + child_unit = child_info->unit[index];
> + parent_unit = parent_info->unit[index];
> + for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
> + nr = atomic_long_read(&child_unit->nr_deferred[offset]);
> + atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
> + }
> }
> }
> up_read(&shrinker_rwsem);
> @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> {
> struct shrinker_info *info;
> unsigned long ret, freed = 0;
> - int i;
> + int offset, index = 0;
>
> if (!mem_cgroup_online(memcg))
> return 0;
> @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
> if (unlikely(!info))
> goto unlock;
>
> - for_each_set_bit(i, info->map, info->map_nr_max) {
> - struct shrink_control sc = {
> - .gfp_mask = gfp_mask,
> - .nid = nid,
> - .memcg = memcg,
> - };
> - struct shrinker *shrinker;
> + for (; index < shriner_id_to_index(info->map_nr_max); index++) {
> + struct shrinker_info_unit *unit;

This adds another layer of indent to shrink_slab_memcg(). Please
factor it first so that the code ends up being readable. Doing that
first as a separate patch will also make the actual algorithm
changes in this patch be much more obvious - this huge hunk of
diff is pretty much impossible to review...

-Dave.
--
Dave Chinner
[email protected]

2023-08-08 15:54:57

by Qi Zheng

[permalink] [raw]
Subject: Re: [PATCH v4 44/48] mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}

Hi Dave,

On 2023/8/8 10:12, Dave Chinner wrote:
> On Mon, Aug 07, 2023 at 07:09:32PM +0800, Qi Zheng wrote:
>> Currently, we maintain two linear arrays per node per memcg, which are
>> shrinker_info::map and shrinker_info::nr_deferred. And we need to resize
>> them when the shrinker_nr_max is exceeded, that is, allocate a new array,
>> and then copy the old array to the new array, and finally free the old
>> array by RCU.
>>
>> For shrinker_info::map, we do set_bit() under the RCU lock, so we may set
>> the value into the old map which is about to be freed. This may cause the
>> value set to be lost. The current solution is not to copy the old map when
>> resizing, but to set all the corresponding bits in the new map to 1. This
>> solves the data loss problem, but bring the overhead of more pointless
>> loops while doing memcg slab shrink.
>>
>> For shrinker_info::nr_deferred, we will only modify it under the read lock
>> of shrinker_rwsem, so it will not run concurrently with the resizing. But
>> after we make memcg slab shrink lockless, there will be the same data loss
>> problem as shrinker_info::map, and we can't work around it like the map.
>>
>> For such resizable arrays, the most straightforward idea is to change it
>> to xarray, like we did for list_lru [1]. We need to do xa_store() in the
>> list_lru_add()-->set_shrinker_bit(), but this will cause memory
>> allocation, and the list_lru_add() doesn't accept failure. A possible
>> solution is to pre-allocate, but the location of pre-allocation is not
>> well determined.
>
> So you implemented a two level array that preallocates leaf
> nodes to work around it? It's remarkable complex for what it does,

Yes, here I have implemented a two level array like the following:

+---------------+--------+--------+-----+
| shrinker_info | unit 0 | unit 1 | ... | (secondary array)
+---------------+--------+--------+-----+
^
|
+---------------+-----+
| nr_deferred[] | map | (leaf array)
+---------------+-----+
(shrinker_info_unit)

The leaf array is never freed unless the memcg is destroyed. The
secondary array will be resized every time the shrinker id exceeds
shrinker_nr_max.

> I can't help but think a radix tree using a special holder for
> nr_deferred values of zero would end up being simpler...

I tried. If the shrinker uses list_lru, then we can preallocate
xa node where list_lru_one is pre-allocated. But for other types of
shrinkers, the location of pre-allocation is not easy to determine
(Such as deferred_split_shrinker). And we can't force all memcg aware
shrinkers to use list_lru, so I gave up using xarray and implemented the
above two-level array.

>
>> Therefore, this commit chooses to introduce a secondary array for
>> shrinker_info::{map, nr_deferred}, so that we only need to copy this
>> secondary array every time the size is resized. Then even if we get the
>> old secondary array under the RCU lock, the found map and nr_deferred are
>> also true, so no data is lost.
>
> I don't understand what you are trying to describe here. If we get
> the old array, then don't we get either a stale nr_deferred value,
> or the update we do gets lost because the next shrinker lookup will
> find the new array and os the deferred value stored to the old one
> is never seen again?

As shown above, the leaf array will not be freed when shrinker_info is
expanded, so the shrinker_info_unit can be indexed from both the old
and the new shrinker_info->unit[x]. So the updated nr_deferred and map
will not be lost.

>
>>
>> [1]. https://lore.kernel.org/all/[email protected]/
>>
>> Signed-off-by: Qi Zheng <[email protected]>
>> Reviewed-by: Muchun Song <[email protected]>
>> ---
> .....
>> diff --git a/mm/shrinker.c b/mm/shrinker.c
>> index a27779ed3798..1911c06b8af5 100644
>> --- a/mm/shrinker.c
>> +++ b/mm/shrinker.c
>> @@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem);
>> #ifdef CONFIG_MEMCG
>> static int shrinker_nr_max;
>>
>> -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
>> -static inline int shrinker_map_size(int nr_items)
>> +static inline int shrinker_unit_size(int nr_items)
>> {
>> - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
>> + return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
>> }
>>
>> -static inline int shrinker_defer_size(int nr_items)
>> +static inline void shrinker_unit_free(struct shrinker_info *info, int start)
>> {
>> - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
>> + struct shrinker_info_unit **unit;
>> + int nr, i;
>> +
>> + if (!info)
>> + return;
>> +
>> + unit = info->unit;
>> + nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
>> +
>> + for (i = start; i < nr; i++) {
>> + if (!unit[i])
>> + break;
>> +
>> + kvfree(unit[i]);
>> + unit[i] = NULL;
>> + }
>> +}
>> +
>> +static inline int shrinker_unit_alloc(struct shrinker_info *new,
>> + struct shrinker_info *old, int nid)
>> +{
>> + struct shrinker_info_unit *unit;
>> + int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
>> + int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
>> + int i;
>> +
>> + for (i = start; i < nr; i++) {
>> + unit = kvzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
>
> A unit is 576 bytes. Why is this using kvzalloc_node()?

Ah, will use kzalloc_node() in the next version.

>
>> + if (!unit) {
>> + shrinker_unit_free(new, start);
>> + return -ENOMEM;
>> + }
>> +
>> + new->unit[i] = unit;
>> + }
>> +
>> + return 0;
>> }
>>
>> void free_shrinker_info(struct mem_cgroup *memcg)
>> @@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg)
>> for_each_node(nid) {
>> pn = memcg->nodeinfo[nid];
>> info = rcu_dereference_protected(pn->shrinker_info, true);
>> + shrinker_unit_free(info, 0);
>> kvfree(info);
>> rcu_assign_pointer(pn->shrinker_info, NULL);
>> }
>
> Why is this safe? The info and maps are looked up by RCU, so why is
> freeing them without a RCU grace period expiring safe?

The free_shrinker_info() will be called in alloc_shrinker_info() and
mem_cgroup_css_free().

In alloc_shrinker_info(), it will only be called in the error path, so
shrinker_info_unit and shrinker_info can be safely freed.

In mem_cgroup_css_free(), when we get here, the traversal of this memcg
has ended and will not be found again. That is to say, the corresponding
shrink_slab() is also over, so shrinker_info_unit and shrinker_info can
also be safely freed here.

>
> Yes, it was safe to do this when it was all under a semaphore, but
> now the lookup and use is under RCU, so this freeing isn't
> serialised against lookups anymore...
>
>
>> @@ -40,28 +76,27 @@ void free_shrinker_info(struct mem_cgroup *memcg)
>> int alloc_shrinker_info(struct mem_cgroup *memcg)
>> {
>> struct shrinker_info *info;
>> - int nid, size, ret = 0;
>> - int map_size, defer_size = 0;
>> + int nid, ret = 0;
>> + int array_size = 0;
>>
>> down_write(&shrinker_rwsem);
>> - map_size = shrinker_map_size(shrinker_nr_max);
>> - defer_size = shrinker_defer_size(shrinker_nr_max);
>> - size = map_size + defer_size;
>> + array_size = shrinker_unit_size(shrinker_nr_max);
>> for_each_node(nid) {
>> - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
>> - if (!info) {
>> - free_shrinker_info(memcg);
>> - ret = -ENOMEM;
>> - break;
>> - }
>> - info->nr_deferred = (atomic_long_t *)(info + 1);
>> - info->map = (void *)info->nr_deferred + defer_size;
>> + info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
>> + if (!info)
>> + goto err;
>> info->map_nr_max = shrinker_nr_max;
>> + if (shrinker_unit_alloc(info, NULL, nid))
>> + goto err;
>
> That's going to now do a lot of small memory allocation when we have
> lots of shrinkers active....
>
>> @@ -150,17 +175,34 @@ static int expand_shrinker_info(int new_id)
>> return ret;
>> }
>>
>> +static inline int shriner_id_to_index(int shrinker_id)
>
> shrinker_id_to_index

Will fix.

>
>> +{
>> + return shrinker_id / SHRINKER_UNIT_BITS;
>> +}
>> +
>> +static inline int shriner_id_to_offset(int shrinker_id)
>
> shrinker_id_to_offset

Will fix.

>
>> +{
>> + return shrinker_id % SHRINKER_UNIT_BITS;
>> +}
>
> ....
>> @@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
>> struct mem_cgroup *memcg)
>> {
>> struct shrinker_info *info;
>> + struct shrinker_info_unit *unit;
>>
>> info = shrinker_info_protected(memcg, nid);
>> - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
>> + unit = info->unit[shriner_id_to_index(shrinker->id)];
>> + return atomic_long_xchg(&unit->nr_deferred[shriner_id_to_offset(shrinker->id)], 0);
>> }
>>
>> static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
>> struct mem_cgroup *memcg)
>> {
>> struct shrinker_info *info;
>> + struct shrinker_info_unit *unit;
>>
>> info = shrinker_info_protected(memcg, nid);
>> - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
>> + unit = info->unit[shriner_id_to_index(shrinker->id)];
>> + return atomic_long_add_return(nr, &unit->nr_deferred[shriner_id_to_offset(shrinker->id)]);
>> }
>>
>> void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> {
>> - int i, nid;
>> + int nid, index, offset;
>> long nr;
>> struct mem_cgroup *parent;
>> struct shrinker_info *child_info, *parent_info;
>> + struct shrinker_info_unit *child_unit, *parent_unit;
>>
>> parent = parent_mem_cgroup(memcg);
>> if (!parent)
>> @@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
>> for_each_node(nid) {
>> child_info = shrinker_info_protected(memcg, nid);
>> parent_info = shrinker_info_protected(parent, nid);
>> - for (i = 0; i < child_info->map_nr_max; i++) {
>> - nr = atomic_long_read(&child_info->nr_deferred[i]);
>> - atomic_long_add(nr, &parent_info->nr_deferred[i]);
>> + for (index = 0; index < shriner_id_to_index(child_info->map_nr_max); index++) {
>> + child_unit = child_info->unit[index];
>> + parent_unit = parent_info->unit[index];
>> + for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
>> + nr = atomic_long_read(&child_unit->nr_deferred[offset]);
>> + atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
>> + }
>> }
>> }
>> up_read(&shrinker_rwsem);
>> @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> {
>> struct shrinker_info *info;
>> unsigned long ret, freed = 0;
>> - int i;
>> + int offset, index = 0;
>>
>> if (!mem_cgroup_online(memcg))
>> return 0;
>> @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>> if (unlikely(!info))
>> goto unlock;
>>
>> - for_each_set_bit(i, info->map, info->map_nr_max) {
>> - struct shrink_control sc = {
>> - .gfp_mask = gfp_mask,
>> - .nid = nid,
>> - .memcg = memcg,
>> - };
>> - struct shrinker *shrinker;
>> + for (; index < shriner_id_to_index(info->map_nr_max); index++) {
>> + struct shrinker_info_unit *unit;
>
> This adds another layer of indent to shrink_slab_memcg(). Please
> factor it first so that the code ends up being readable. Doing that
> first as a separate patch will also make the actual algorithm
> changes in this patch be much more obvious - this huge hunk of
> diff is pretty much impossible to review...

OK, I will send this patch together with PATCH v4 01/02/03/43 as
a single cleanup patchset.

Thanks,
Qi

>
> -Dave.