If we run 10k containers in the system, the size of the
list_lru_memcg->lrus can be ~96KB per list_lru. When we decrease the
number of containers, the size of the array will not be shrunk; it is
not scalable. The xarray is a good choice for this case. We can save
a lot of memory when there are tens of thousands of containers in the
system. If we use an xarray, we can also remove the array-resizing
logic, which simplifies the code.
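For illustration, the core lookup change (a simplified sketch; the names
mirror the structures touched by the diff below):

	/* before: flat array sized by memcg_nr_cache_ids, resized on demand */
	mlrus = rcu_dereference(lru->mlrus);
	mlru  = rcu_dereference(mlrus->mlru[idx]);

	/* after: sparse xarray indexed by kmemcg_id, no resizing needed */
	mlru  = xa_load(&lru->xa, idx);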
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/list_lru.h | 13 +--
include/linux/memcontrol.h | 23 ------
mm/list_lru.c | 202 +++++++++++++++------------------------------
mm/memcontrol.c | 77 ++---------------
4 files changed, 73 insertions(+), 242 deletions(-)
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 4b00fd8cb373..572c263561ac 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/shrinker.h>
+#include <linux/xarray.h>
struct mem_cgroup;
@@ -37,12 +38,6 @@ struct list_lru_per_memcg {
struct list_lru_one node[];
};
-struct list_lru_memcg {
- struct rcu_head rcu;
- /* array of per cgroup lists, indexed by memcg_cache_id */
- struct list_lru_per_memcg __rcu *mlru[];
-};
-
struct list_lru_node {
/* protects all lists on the node, including per cgroup */
spinlock_t lock;
@@ -57,10 +52,7 @@ struct list_lru {
struct list_head list;
int shrinker_id;
bool memcg_aware;
- /* protects ->mlrus->mlru[i] */
- spinlock_t lock;
- /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
- struct list_lru_memcg __rcu *mlrus;
+ struct xarray xa;
#endif
};
@@ -77,7 +69,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp);
-int memcg_update_all_list_lrus(int num_memcgs);
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent);
/**
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0d6d838cadfd..1fe44ec5aa03 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1686,18 +1686,6 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
extern struct static_key_false memcg_kmem_enabled_key;
-extern int memcg_nr_cache_ids;
-void memcg_get_cache_ids(void);
-void memcg_put_cache_ids(void);
-
-/*
- * Helper macro to loop through all memcg-specific caches. Callers must still
- * check if the cache is valid (it is either valid or NULL).
- * the slab_mutex must be held when looping through those caches
- */
-#define for_each_memcg_cache_index(_idx) \
- for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
-
static inline bool memcg_kmem_enabled(void)
{
return static_branch_likely(&memcg_kmem_enabled_key);
@@ -1754,9 +1742,6 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}
-#define for_each_memcg_cache_index(_idx) \
- for (; NULL; )
-
static inline bool memcg_kmem_enabled(void)
{
return false;
@@ -1767,14 +1752,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return -1;
}
-static inline void memcg_get_cache_ids(void)
-{
-}
-
-static inline void memcg_put_cache_ids(void)
-{
-}
-
static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
return NULL;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 488dacd1f8ff..8dc1dabb9f05 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,21 +52,12 @@ static int lru_shrinker_id(struct list_lru *lru)
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
- struct list_lru_memcg *mlrus;
- struct list_lru_node *nlru = &lru->node[nid];
+ if (list_lru_memcg_aware(lru) && idx >= 0) {
+ struct list_lru_per_memcg *mlru = xa_load(&lru->xa, idx);
- /*
- * Either lock or RCU protects the array of per cgroup lists
- * from relocation (see memcg_update_list_lru).
- */
- mlrus = rcu_dereference_check(lru->mlrus, lockdep_is_held(&nlru->lock));
- if (mlrus && idx >= 0) {
- struct list_lru_per_memcg *mlru;
-
- mlru = rcu_dereference_check(mlrus->mlru[idx], true);
return mlru ? &mlru->node[nid] : NULL;
}
- return &nlru->lru;
+ return &lru->node[nid].lru;
}
static inline struct list_lru_one *
@@ -77,7 +68,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
struct list_lru_one *l = &nlru->lru;
struct mem_cgroup *memcg = NULL;
- if (!lru->mlrus)
+ if (!list_lru_memcg_aware(lru))
goto out;
memcg = mem_cgroup_from_obj(ptr);
@@ -309,16 +300,20 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
unsigned long *nr_to_walk)
{
long isolated = 0;
- int memcg_idx;
isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
nr_to_walk);
+
+#ifdef CONFIG_MEMCG_KMEM
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx) {
+ struct list_lru_per_memcg *mlru;
+ unsigned long index;
+
+ xa_for_each(&lru->xa, index, mlru) {
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+ isolated += __list_lru_walk_one(lru, nid, index,
isolate, cb_arg,
nr_to_walk);
spin_unlock(&nlru->lock);
@@ -327,6 +322,8 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
break;
}
}
+#endif
+
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
@@ -338,15 +335,6 @@ static void init_one_lru(struct list_lru_one *l)
}
#ifdef CONFIG_MEMCG_KMEM
-static void memcg_destroy_list_lru_range(struct list_lru_memcg *mlrus,
- int begin, int end)
-{
- int i;
-
- for (i = begin; i < end; i++)
- kfree(mlrus->mlru[i]);
-}
-
static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
int nid;
@@ -364,14 +352,7 @@ static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp)
static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
{
- struct list_lru_memcg *mlrus;
- struct list_lru_per_memcg *mlru;
-
- spin_lock_irq(&lru->lock);
- mlrus = rcu_dereference_protected(lru->mlrus, true);
- mlru = rcu_dereference_protected(mlrus->mlru[src_idx], true);
- rcu_assign_pointer(mlrus->mlru[src_idx], NULL);
- spin_unlock_irq(&lru->lock);
+ struct list_lru_per_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
/*
* The __list_lru_walk_one() can walk the list of this node.
@@ -383,78 +364,27 @@ static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
kvfree_rcu(mlru, rcu);
}
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- struct list_lru_memcg *mlrus;
- int size = memcg_nr_cache_ids;
-
+ if (memcg_aware)
+ xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
lru->memcg_aware = memcg_aware;
- if (!memcg_aware)
- return 0;
-
- spin_lock_init(&lru->lock);
-
- mlrus = kvzalloc(struct_size(mlrus, mlru, size), GFP_KERNEL);
- if (!mlrus)
- return -ENOMEM;
-
- RCU_INIT_POINTER(lru->mlrus, mlrus);
-
- return 0;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
{
- struct list_lru_memcg *mlrus;
+ XA_STATE(xas, &lru->xa, 0);
+ struct list_lru_per_memcg *mlru;
if (!list_lru_memcg_aware(lru))
return;
- /*
- * This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu().
- */
- mlrus = rcu_dereference_protected(lru->mlrus, true);
- memcg_destroy_list_lru_range(mlrus, 0, memcg_nr_cache_ids);
- kvfree(mlrus);
-}
-
-static int memcg_update_list_lru(struct list_lru *lru, int old_size, int new_size)
-{
- struct list_lru_memcg *old, *new;
-
- BUG_ON(old_size > new_size);
-
- old = rcu_dereference_protected(lru->mlrus,
- lockdep_is_held(&list_lrus_mutex));
- new = kvmalloc(struct_size(new, mlru, new_size), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- spin_lock_irq(&lru->lock);
- memcpy(&new->mlru, &old->mlru, flex_array_size(new, mlru, old_size));
- memset(&new->mlru[old_size], 0, flex_array_size(new, mlru, new_size - old_size));
- rcu_assign_pointer(lru->mlrus, new);
- spin_unlock_irq(&lru->lock);
-
- kvfree_rcu(old, rcu);
- return 0;
-}
-
-int memcg_update_all_list_lrus(int new_size)
-{
- int ret = 0;
- struct list_lru *lru;
- int old_size = memcg_nr_cache_ids;
-
- mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &memcg_list_lrus, list) {
- ret = memcg_update_list_lru(lru, old_size, new_size);
- if (ret)
- break;
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, mlru, ULONG_MAX) {
+ kfree(mlru);
+ xas_store(&xas, NULL);
}
- mutex_unlock(&list_lrus_mutex);
- return ret;
+ xas_unlock_irq(&xas);
}
static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
@@ -521,7 +451,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
struct mem_cgroup *child;
child = mem_cgroup_from_css(css);
- child->kmemcg_id = parent->kmemcg_id;
+ WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
}
rcu_read_unlock();
@@ -531,21 +461,12 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
mutex_unlock(&list_lrus_mutex);
}
-static bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
- struct list_lru *lru)
+static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
+ struct list_lru *lru)
{
- bool allocated;
- int idx;
+ int idx = memcg->kmemcg_id;
- idx = memcg->kmemcg_id;
- if (unlikely(idx < 0))
- return true;
-
- rcu_read_lock();
- allocated = !!rcu_access_pointer(rcu_dereference(lru->mlrus)->mlru[idx]);
- rcu_read_unlock();
-
- return allocated;
+ return idx < 0 || xa_load(&lru->xa, idx);
}
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
@@ -558,6 +479,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
struct list_lru_per_memcg *mlru;
struct mem_cgroup *memcg;
} *table;
+ XA_STATE(xas, &lru->xa, 0);
if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
return 0;
@@ -586,27 +508,48 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
}
}
- spin_lock_irqsave(&lru->lock, flags);
- mlrus = rcu_dereference_protected(lru->mlrus, true);
+ xas_lock_irqsave(&xas, flags);
while (i--) {
- int index = table[i].memcg->kmemcg_id;
+ int index = READ_ONCE(table[i].memcg->kmemcg_id);
struct list_lru_per_memcg *mlru = table[i].mlru;
- if (index < 0 || rcu_dereference_protected(mlrus->mlru[index], true))
+ xas_set(&xas, index);
+retry:
+ if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
kfree(mlru);
- else
- rcu_assign_pointer(mlrus->mlru[index], mlru);
+ } else {
+ xas_store(&xas, mlru);
+ if (xas_error(&xas) == -ENOMEM) {
+ xas_unlock_irqrestore(&xas, flags);
+ if (xas_nomem(&xas, gfp))
+ xas_set_err(&xas, 0);
+ xas_lock_irqsave(&xas, flags);
+ /*
+ * The xas lock has been released, this memcg
+ * can be reparented before us. So reload
+ * memcg id. More details see the comments
+ * in memcg_reparent_list_lrus().
+ */
+ index = READ_ONCE(table[i].memcg->kmemcg_id);
+ if (index < 0)
+ xas_set_err(&xas, 0);
+ else if (!xas_error(&xas) && index != xas.xa_index)
+ xas_set(&xas, index);
+ goto retry;
+ }
+ }
}
- spin_unlock_irqrestore(&lru->lock, flags);
-
+ /* xas_nomem() is used to free memory instead of memory allocation. */
+ if (xas.xa_alloc)
+ xas_nomem(&xas, gfp);
+ xas_unlock_irqrestore(&xas, flags);
kfree(table);
- return 0;
+ return xas_error(&xas);
}
#else
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- return 0;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
@@ -618,7 +561,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
if (shrinker)
@@ -626,11 +568,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
else
lru->shrinker_id = -1;
#endif
- memcg_get_cache_ids();
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
- goto out;
+ return -ENOMEM;
for_each_node(i) {
spin_lock_init(&lru->node[i].lock);
@@ -639,18 +580,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
init_one_lru(&lru->node[i].lru);
}
- err = memcg_init_list_lru(lru, memcg_aware);
- if (err) {
- kfree(lru->node);
- /* Do this so a list_lru_destroy() doesn't crash: */
- lru->node = NULL;
- goto out;
- }
-
+ memcg_init_list_lru(lru, memcg_aware);
list_lru_register(lru);
-out:
- memcg_put_cache_ids();
- return err;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);
@@ -660,8 +593,6 @@ void list_lru_destroy(struct list_lru *lru)
if (!lru->node)
return;
- memcg_get_cache_ids();
-
list_lru_unregister(lru);
memcg_destroy_list_lru(lru);
@@ -671,6 +602,5 @@ void list_lru_destroy(struct list_lru *lru)
#ifdef CONFIG_MEMCG_KMEM
lru->shrinker_id = -1;
#endif
- memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 87ee5b431c05..361ac289d8e9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -351,42 +351,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
* This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * The current size of the caches array is stored in memcg_nr_cache_ids. It
- * will double each time we have to increase it.
+ * but only a few kmem-limited.
*/
static DEFINE_IDA(memcg_cache_ida);
-int memcg_nr_cache_ids;
-
-/* Protects memcg_nr_cache_ids */
-static DECLARE_RWSEM(memcg_cache_ids_sem);
-
-void memcg_get_cache_ids(void)
-{
- down_read(&memcg_cache_ids_sem);
-}
-
-void memcg_put_cache_ids(void)
-{
- up_read(&memcg_cache_ids_sem);
-}
/*
- * MIN_SIZE is different than 1, because we would like to avoid going through
- * the alloc/free process all the time. In a small machine, 4 kmem-limited
- * cgroups is a reasonable guess. In the future, it could be a parameter or
- * tunable, but that is strictly not necessary.
- *
* MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
* cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
-#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
@@ -2922,49 +2897,6 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
return objcg;
}
-static int memcg_alloc_cache_id(void)
-{
- int id, size;
- int err;
-
- id = ida_simple_get(&memcg_cache_ida,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (id < 0)
- return id;
-
- if (id < memcg_nr_cache_ids)
- return id;
-
- /*
- * There's no space for the new id in memcg_caches arrays,
- * so we have to grow them.
- */
- down_write(&memcg_cache_ids_sem);
-
- size = 2 * (id + 1);
- if (size < MEMCG_CACHES_MIN_SIZE)
- size = MEMCG_CACHES_MIN_SIZE;
- else if (size > MEMCG_CACHES_MAX_SIZE)
- size = MEMCG_CACHES_MAX_SIZE;
-
- err = memcg_update_all_list_lrus(size);
- if (!err)
- memcg_nr_cache_ids = size;
-
- up_write(&memcg_cache_ids_sem);
-
- if (err) {
- ida_simple_remove(&memcg_cache_ida, id);
- return err;
- }
- return id;
-}
-
-static void memcg_free_cache_id(int id)
-{
- ida_simple_remove(&memcg_cache_ida, id);
-}
-
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -3619,13 +3551,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (unlikely(mem_cgroup_is_root(memcg)))
return 0;
- memcg_id = memcg_alloc_cache_id();
+ memcg_id = ida_alloc_max(&memcg_cache_ida, MEMCG_CACHES_MAX_SIZE - 1,
+ GFP_KERNEL);
if (memcg_id < 0)
return memcg_id;
objcg = obj_cgroup_alloc();
if (!objcg) {
- memcg_free_cache_id(memcg_id);
+ ida_free(&memcg_cache_ida, memcg_id);
return -ENOMEM;
}
objcg->memcg = memcg;
@@ -3669,7 +3602,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
*/
memcg_reparent_list_lrus(memcg, parent);
- memcg_free_cache_id(kmemcg_id);
+ ida_free(&memcg_cache_ida, kmemcg_id);
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
--
2.11.0
On Thu, Mar 31, 2022 at 11:26 AM NeilBrown <[email protected]> wrote:
>
> On Mon, 28 Feb 2022, Muchun Song wrote:
> > If we run 10k containers in the system, the size of the
> > list_lru_memcg->lrus can be ~96KB per list_lru. When we decrease the
> > number of containers, the size of the array will not be shrunk; it is
> > not scalable. The xarray is a good choice for this case. We can save
> > a lot of memory when there are tens of thousands of containers in the
> > system. If we use an xarray, we can also remove the array-resizing
> > logic, which simplifies the code.
> >
> > Signed-off-by: Muchun Song <[email protected]>
>
> Hi,
> I've just tried some simple testing on NFS (xfstests generic/???) and
> it reliably crashes in this code.
> Specifically in list_lru_add(), list_lru_from_kmem() returns NULL,
> which results in a NULL deref.
> list_lru_from_kmem() returns NULL because an xa_load() returns NULL.
Did you test with the patch [1]?
[1] https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=ae085d7f9365de7da27ab5c0d16b12d51ea7fca9
>
> The patch doesn't make any sense to me. It replaces an array of
> structures with an xarray, which is an array of pointers.
> It seems to assume that all the pointers in the array get magically
> allocated to the required structures. I certainly cannot find when
> the 'struct list_lru_node' structures get allocated. So xa_load() will
> *always* return NULL in this code.
The per-memcg structures (struct list_lru_per_memcg) are allocated via
kmem_cache_alloc_lru().
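For reference, a sketch of the expected usage (entry_cachep, entry_lru and
the entry layout are made-up names for illustration, not from the patch):
an object that will sit on a memcg-aware list_lru has to be allocated with
the lru passed in, so the per-memcg list is created and stored in lru->xa
before list_lru_add() looks it up:

	entry = kmem_cache_alloc_lru(entry_cachep, &entry_lru, GFP_KERNEL);
	if (entry)
		/* xa_load() in list_lru_from_kmem() now finds the entry */
		list_lru_add(&entry_lru, &entry->lru);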
>
> I can provide more details of the failure stack if needed, but I doubt
> that would be necessary.
>
If the above fix does not resolve your issue, would you mind providing
the .config and the stack trace?
Thanks for your report.
On Mon, 28 Feb 2022, Muchun Song wrote:
> If we run 10k containers in the system, the size of the
> list_lru_memcg->lrus can be ~96KB per list_lru. When we decrease the
> number of containers, the size of the array will not be shrunk; it is
> not scalable. The xarray is a good choice for this case. We can save
> a lot of memory when there are tens of thousands of containers in the
> system. If we use an xarray, we can also remove the array-resizing
> logic, which simplifies the code.
>
> Signed-off-by: Muchun Song <[email protected]>
Hi,
I've just tried some simple testing on NFS (xfstests generic/???) and
it reliably crashes in this code.
Specifically in list_lru_add(), list_lru_from_kmem() returns NULL,
which results in a NULL deref.
list_lru_from_kmem() returns NULL because an xa_load() returns NULL.
The patch doesn't make any sense to me. It replaces an array of
structures with an xarray, which is an array of pointers.
It seems to assume that all the pointers in the array get magically
allocated to the required structures. I certainly cannot find when
the 'struct list_lru_node' structures get allocated. So xa_load() will
*always* return NULL in this code.
I can provide more details of the failure stack if needed, but I doubt
that would be necessary.
Thanks,
NeilBrown
On Thu, Mar 31, 2022 at 12:25 PM NeilBrown <[email protected]> wrote:
>
> On Thu, 31 Mar 2022, Muchun Song wrote:
> >
> > If the above fix does not resolve your issue, would you mind providing
> > the .config and the stack trace?
>
> The kernel I'm using is
> 74164d284b2909de0ba13518cc063e9ea9334749
> plus one patch in fs/namei.c
> So it does include the commit you mentioned.
>
> Config is below
>
> I run
> ./check -nfs generic/037
> in xfstests, and crash is quick.
>
> Stack trace is
>
> [ 121.557601] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [ 121.558003] #PF: supervisor read access in kernel mode
> [ 121.558299] #PF: error_code(0x0000) - not-present page
> [ 121.558598] PGD 0 P4D 0
> [ 121.558750] Oops: 0000 [#1] PREEMPT SMP
> [ 121.558978] CPU: 2 PID: 1116 Comm: setfattr Not tainted 5.17.0-dev #455
> [ 121.559360] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014
> [ 121.560009] RIP: 0010:list_lru_add+0x58/0xae
> [ 121.560267] Code: 00 48 8d 58 48 74 23 48 89 ef e8 93 08 03 00 49 89 c5 48 85 c0 74 13 8b 90 40 0e 00 00 31 f6 4c 89 e7 e8 66 fb ff ff 48 3
> [ 121.561353] RSP: 0018:ffffc900016dfbd0 EFLAGS: 00010246
> [ 121.561668] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000094fd1aeb
> [ 121.562076] RDX: ffff888007768be8 RSI: ffffffff826b4914 RDI: ffffffff82745064
> [ 121.562484] RBP: ffff8880097b3888 R08: ffffffffffffffff R09: ffff888007768b40
> [ 121.562890] R10: ffffc900016dfa98 R11: 0000000000008f0c R12: ffffffff8482e7a0
> [ 121.563296] R13: ffff888007766000 R14: ffff888005e72300 R15: 0000000000000000
> [ 121.563702] FS: 00007f558ef08580(0000) GS:ffff88801f200000(0000) knlGS:0000000000000000
> [ 121.564166] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 121.564499] CR2: 0000000000000008 CR3: 00000000084c4000 CR4: 00000000000006e0
> [ 121.564905] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 121.565314] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 121.565719] Call Trace:
> [ 121.565860] <TASK>
> [ 121.565985] nfs4_xattr_get_cache+0x131/0x169
> [ 121.566239] nfs4_xattr_cache_add+0x47/0x15a
> [ 121.566485] nfs4_xattr_set_nfs4_user+0xcb/0xef
> [ 121.566748] __vfs_setxattr+0x66/0x72
> [ 121.566961] __vfs_setxattr_noperm+0x6e/0xf5
> [ 121.567211] vfs_setxattr+0xa7/0x12a
> [ 121.567419] setxattr+0x115/0x14d
> [ 121.567612] ? check_chain_key+0xde/0x11f
> [ 121.567846] path_setxattr+0x78/0xcf
> [ 121.568053] __x64_sys_setxattr+0x22/0x25
> [ 121.568287] do_syscall_64+0x6d/0x80
> [ 121.568497] entry_SYSCALL_64_after_hwframe+0x44/0xae
>
Thanks for your report. I know the reason: the following patch in this
series was missed upstream. Could you help me test whether it works
properly?
[v6,06/16] nfs42: use a specific kmem_cache to allocate nfs4_xattr_entry
Hi Andrew,
Would you mind picking it up?
Thanks.
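For reference, the gist of the missing patch (simplified from the full
version quoted later in this thread) is to switch the entry allocation
from a bare kmalloc() to kmem_cache_alloc_lru(), so the per-memcg lru is
set up before the entry is added to it:

	-	buf = kmalloc(alloclen, GFP_KERNEL);
	+	entry = kmem_cache_alloc_lru(nfs4_xattr_entry_cachep, lru, gfp);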
On Mon, Feb 28, 2022 at 08:21:22PM +0800, Muchun Song wrote:
> @@ -586,27 +508,48 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
> }
> }
>
> + xas_lock_irqsave(&xas, flags);
> while (i--) {
> + int index = READ_ONCE(table[i].memcg->kmemcg_id);
> struct list_lru_per_memcg *mlru = table[i].mlru;
>
> + xas_set(&xas, index);
> +retry:
> + if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
> kfree(mlru);
> + } else {
> + xas_store(&xas, mlru);
> + if (xas_error(&xas) == -ENOMEM) {
> + xas_unlock_irqrestore(&xas, flags);
> + if (xas_nomem(&xas, gfp))
> + xas_set_err(&xas, 0);
> + xas_lock_irqsave(&xas, flags);
> + /*
> + * The xas lock has been released, this memcg
> + * can be reparented before us. So reload
> + * memcg id. More details see the comments
> + * in memcg_reparent_list_lrus().
> + */
> + index = READ_ONCE(table[i].memcg->kmemcg_id);
> + if (index < 0)
> + xas_set_err(&xas, 0);
> + else if (!xas_error(&xas) && index != xas.xa_index)
> + xas_set(&xas, index);
> + goto retry;
> + }
> + }
> }
> + /* xas_nomem() is used to free memory instead of memory allocation. */
> + if (xas.xa_alloc)
> + xas_nomem(&xas, gfp);
> + xas_unlock_irqrestore(&xas, flags);
> kfree(table);
>
> + return xas_error(&xas);
> }
This is really unidiomatic. And so much more complicated than it needs
to be.
	while (i--) {
		do {
			int index = READ_ONCE(table[i].memcg->kmemcg_id);
			struct list_lru_per_memcg *mlru = table[i].mlru;

			xas_set(&xas, index);
			xas_lock_irqsave(&xas, flags);
			if (index < 0 || xas_load(&xas))
				xas_set_err(&xas, -EBUSY);
			xas_store(&xas, mlru);
			if (!xas_error(&xas))
				break;
			xas_unlock_irqrestore(&xas, flags);
		} while (xas_nomem(&xas, gfp));

		if (xas_error(&xas))
			kfree(mlru);
	}

	kfree(table);
	return xas_error(&xas);
(you might want to mask out the EBUSY error here)
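For instance, the masking could go right before the final return (a sketch,
not part of the suggestion above):

	/* an existing entry (or an offline memcg) is not an error here */
	if (xas_error(&xas) == -EBUSY)
		xas_set_err(&xas, 0);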
On Fri, Apr 1, 2022 at 6:36 AM NeilBrown <[email protected]> wrote:
>
> On Thu, 31 Mar 2022, Muchun Song wrote:
> >
> > Thanks for your report. I know the reason: the following patch in this
> > series was missed upstream. Could you help me test whether it works
> > properly?
> >
> > [v6,06/16] nfs42: use a specific kmem_cache to allocate nfs4_xattr_entry
> >
>
> Thanks for the quick response! That patch helps, but has a bug. My
> testing resulted in refcount underflow errors.
>
> Problem is that kref_init() is called in nfs4_xattr_entry_init_once().
> This means that it is initialised to '1' the first time an entry is
> allocated, but it is left as zero the second time.
> I applied:
> --- a/fs/nfs/nfs42xattr.c
> +++ b/fs/nfs/nfs42xattr.c
> @@ -191,6 +191,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
> entry = kmem_cache_alloc_lru(nfs4_xattr_entry_cachep, lru, gfp);
> if (!entry)
> return NULL;
> + kref_init(&entry->ref);
> namep = kstrdup_const(name, gfp);
> if (!namep && name)
> goto free_buf;
> @@ -974,7 +975,6 @@ static void nfs4_xattr_entry_init_once(void *p)
> {
> struct nfs4_xattr_entry *entry = p;
>
> - kref_init(&entry->ref);
> entry->bucket = NULL;
> INIT_LIST_HEAD(&entry->lru);
> INIT_LIST_HEAD(&entry->dispose);
>
> and now it seems to work.
>
> The complete patch that I applied is below. I haven't reviewed it, just
> tested it.
> Tested-by: NeilBrown <[email protected]>
>
Thanks for your test. I have looked at the latest code and found that the
following commit removed __GFP_ACCOUNT from the allocation:
5c60e89e71f8 ("NFSv4.2: Fix up an invalid combination of memory
allocation flags")
But that commit forgot to remove SLAB_ACCOUNT when creating
nfs4_xattr_cache_cachep (I think that is a bug). So I think the following
patch could work.
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index ad3405c64b9e..e7b34f7e0614 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -997,7 +997,7 @@ int __init nfs4_xattr_cache_init(void)
nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
sizeof(struct nfs4_xattr_cache), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
nfs4_xattr_cache_init_once);
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
On Thu, 31 Mar 2022, Muchun Song wrote:
>
> Thanks for your report. I know the reason: the following patch in this
> series was missed upstream. Could you help me test whether it works
> properly?
>
> [v6,06/16] nfs42: use a specific kmem_cache to allocate nfs4_xattr_entry
>
Thanks for the quick response! That patch helps, but has a bug. My
testing resulted in refcount underflow errors.
Problem is that kref_init() is called in nfs4_xattr_entry_init_once().
This means that it is initialised to '1' the first time an entry is
allocated, but it is left as zero the second time.
I applied:
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -191,6 +191,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
entry = kmem_cache_alloc_lru(nfs4_xattr_entry_cachep, lru, gfp);
if (!entry)
return NULL;
+ kref_init(&entry->ref);
namep = kstrdup_const(name, gfp);
if (!namep && name)
goto free_buf;
@@ -974,7 +975,6 @@ static void nfs4_xattr_entry_init_once(void *p)
{
struct nfs4_xattr_entry *entry = p;
- kref_init(&entry->ref);
entry->bucket = NULL;
INIT_LIST_HEAD(&entry->lru);
INIT_LIST_HEAD(&entry->dispose);
and now it seems to work.
The complete patch that I applied is below. I haven't reviewed it, just
tested it.
Tested-by: NeilBrown <[email protected]>
Thanks,
NeilBrown
From: Muchun Song <[email protected]>
Date: Mon, 28 Feb 2022 20:21:16 +0800
Subject: [PATCH] nfs42: use a specific kmem_cache to allocate nfs4_xattr_entry
If we want to add the allocated objects to their list_lru, we should use
kmem_cache_alloc_lru() to allocate them. So introduce
nfs4_xattr_entry_cachep, which is used to allocate nfs4_xattr_entry objects.
Signed-off-by: Muchun Song <[email protected]>
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index ad3405c64b9e..d4163c46acf1 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -81,7 +81,7 @@ struct nfs4_xattr_entry {
struct hlist_node hnode;
struct list_head lru;
struct list_head dispose;
- char *xattr_name;
+ const char *xattr_name;
void *xattr_value;
size_t xattr_size;
struct nfs4_xattr_bucket *bucket;
@@ -98,6 +98,7 @@ static struct list_lru nfs4_xattr_entry_lru;
static struct list_lru nfs4_xattr_large_entry_lru;
static struct kmem_cache *nfs4_xattr_cache_cachep;
+static struct kmem_cache *nfs4_xattr_entry_cachep;
/*
* Hashing helper functions.
@@ -177,49 +178,28 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
{
struct nfs4_xattr_entry *entry;
void *valp;
- char *namep;
- size_t alloclen, slen;
- char *buf;
- uint32_t flags;
+ const char *namep;
+ uint32_t flags = len > PAGE_SIZE ? NFS4_XATTR_ENTRY_EXTVAL : 0;
+ gfp_t gfp = GFP_KERNEL;
+ struct list_lru *lru;
BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) +
XATTR_NAME_MAX + 1 > PAGE_SIZE);
- alloclen = sizeof(struct nfs4_xattr_entry);
- if (name != NULL) {
- slen = strlen(name) + 1;
- alloclen += slen;
- } else
- slen = 0;
-
- if (alloclen + len <= PAGE_SIZE) {
- alloclen += len;
- flags = 0;
- } else {
- flags = NFS4_XATTR_ENTRY_EXTVAL;
- }
-
- buf = kmalloc(alloclen, GFP_KERNEL);
- if (buf == NULL)
+ lru = flags & NFS4_XATTR_ENTRY_EXTVAL ? &nfs4_xattr_large_entry_lru :
+ &nfs4_xattr_entry_lru;
+ entry = kmem_cache_alloc_lru(nfs4_xattr_entry_cachep, lru, gfp);
+ if (!entry)
return NULL;
- entry = (struct nfs4_xattr_entry *)buf;
-
- if (name != NULL) {
- namep = buf + sizeof(struct nfs4_xattr_entry);
- memcpy(namep, name, slen);
- } else {
- namep = NULL;
- }
-
-
- if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
- valp = kvmalloc(len, GFP_KERNEL);
- if (valp == NULL) {
- kfree(buf);
- return NULL;
- }
- } else if (len != 0) {
- valp = buf + sizeof(struct nfs4_xattr_entry) + slen;
+ kref_init(&entry->ref);
+ namep = kstrdup_const(name, gfp);
+ if (!namep && name)
+ goto free_buf;
+
+ if (len != 0) {
+ valp = kvmalloc(len, gfp);
+ if (!valp)
+ goto free_name;
} else
valp = NULL;
@@ -232,23 +212,23 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
entry->flags = flags;
entry->xattr_value = valp;
- kref_init(&entry->ref);
entry->xattr_name = namep;
entry->xattr_size = len;
- entry->bucket = NULL;
- INIT_LIST_HEAD(&entry->lru);
- INIT_LIST_HEAD(&entry->dispose);
- INIT_HLIST_NODE(&entry->hnode);
return entry;
+free_name:
+ kfree_const(namep);
+free_buf:
+ kmem_cache_free(nfs4_xattr_entry_cachep, entry);
+ return NULL;
}
static void
nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry)
{
- if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL)
- kvfree(entry->xattr_value);
- kfree(entry);
+ kvfree(entry->xattr_value);
+ kfree_const(entry->xattr_name);
+ kmem_cache_free(nfs4_xattr_entry_cachep, entry);
}
static void
@@ -289,7 +269,7 @@ nfs4_xattr_alloc_cache(void)
{
struct nfs4_xattr_cache *cache;
- cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
+ cache = kmem_cache_alloc_lru(nfs4_xattr_cache_cachep, &nfs4_xattr_cache_lru, GFP_KERNEL);
if (cache == NULL)
return NULL;
@@ -991,6 +971,16 @@ static void nfs4_xattr_cache_init_once(void *p)
INIT_LIST_HEAD(&cache->dispose);
}
+static void nfs4_xattr_entry_init_once(void *p)
+{
+ struct nfs4_xattr_entry *entry = p;
+
+ entry->bucket = NULL;
+ INIT_LIST_HEAD(&entry->lru);
+ INIT_LIST_HEAD(&entry->dispose);
+ INIT_HLIST_NODE(&entry->hnode);
+}
+
int __init nfs4_xattr_cache_init(void)
{
int ret = 0;
@@ -1002,6 +992,13 @@ int __init nfs4_xattr_cache_init(void)
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
+ nfs4_xattr_entry_cachep = kmem_cache_create("nfs4_xattr_entry",
+ sizeof(struct nfs4_xattr_entry), 0,
+ (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ nfs4_xattr_entry_init_once);
+ if (!nfs4_xattr_entry_cachep)
+ goto out5;
+
ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru,
&nfs4_xattr_large_entry_shrinker);
if (ret)
@@ -1039,6 +1036,8 @@ int __init nfs4_xattr_cache_init(void)
out3:
list_lru_destroy(&nfs4_xattr_large_entry_lru);
out4:
+ kmem_cache_destroy(nfs4_xattr_entry_cachep);
+out5:
kmem_cache_destroy(nfs4_xattr_cache_cachep);
return ret;
On Thu, Mar 31, 2022 at 11:01 PM Matthew Wilcox <[email protected]> wrote:
>
> On Mon, Feb 28, 2022 at 08:21:22PM +0800, Muchun Song wrote:
> > @@ -586,27 +508,48 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
> > }
> > }
> >
> > + xas_lock_irqsave(&xas, flags);
> > while (i--) {
> > + int index = READ_ONCE(table[i].memcg->kmemcg_id);
> > struct list_lru_per_memcg *mlru = table[i].mlru;
> >
> > + xas_set(&xas, index);
> > +retry:
> > + if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
> > kfree(mlru);
> > + } else {
> > + xas_store(&xas, mlru);
> > + if (xas_error(&xas) == -ENOMEM) {
> > + xas_unlock_irqrestore(&xas, flags);
> > + if (xas_nomem(&xas, gfp))
> > + xas_set_err(&xas, 0);
> > + xas_lock_irqsave(&xas, flags);
> > + /*
> > + * The xas lock has been released, this memcg
> > + * can be reparented before us. So reload
> > + * memcg id. More details see the comments
> > + * in memcg_reparent_list_lrus().
> > + */
> > + index = READ_ONCE(table[i].memcg->kmemcg_id);
> > + if (index < 0)
> > + xas_set_err(&xas, 0);
> > + else if (!xas_error(&xas) && index != xas.xa_index)
> > + xas_set(&xas, index);
> > + goto retry;
> > + }
> > + }
> > }
> > + /* xas_nomem() is used to free memory instead of memory allocation. */
> > + if (xas.xa_alloc)
> > + xas_nomem(&xas, gfp);
> > + xas_unlock_irqrestore(&xas, flags);
> > kfree(table);
> >
> > + return xas_error(&xas);
> > }
>
> This is really unidiomatic. And so much more complicated than it needs
> to be.
>
> 	while (i--) {
> 		do {
> 			int index = READ_ONCE(table[i].memcg->kmemcg_id);
> 			struct list_lru_per_memcg *mlru = table[i].mlru;
>
> 			xas_set(&xas, index);
> 			xas_lock_irqsave(&xas, flags);
> 			if (index < 0 || xas_load(&xas))
> 				xas_set_err(&xas, -EBUSY);
> 			xas_store(&xas, mlru);
> 			if (!xas_error(&xas))
> 				break;
> 			xas_unlock_irqrestore(&xas, flags);
> 		} while (xas_nomem(&xas, gfp));
>
> 		if (xas_error(&xas))
> 			kfree(mlru);
> 	}
>
> 	kfree(table);
> 	return xas_error(&xas);
>
> (you might want to mask out the EBUSY error here)
>
I'm not familiar with the xarray APIs. I'll dig deeper to see whether this
code works properly. Thanks for your suggestions.