On Sun, Feb 11, 2024 at 01:57:04PM +0000, Chengming Zhou wrote:
> Dynamic zswap_pool creation may create/reuse to have multiple
> zswap_pools in a list, only the first will be current used.
>
> Each zswap_pool has its own lru and shrinker, which is not
> necessary and has its problem:
>
> 1. When memory has pressure, all shrinker of zswap_pools will
> try to shrink its own lru, there is no order between them.
>
> 2. When zswap limit hit, only the last zswap_pool's shrink_work
> will try to shrink its lru, which is inefficient.
>
> Anyway, having a global lru and shrinker shared by all zswap_pools
> is better and efficient.
It is also a great simplification.
>
> Signed-off-by: Chengming Zhou <[email protected]>
> ---
> mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
> 1 file changed, 55 insertions(+), 98 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 62fe307521c9..7668db8c10e3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -176,14 +176,17 @@ struct zswap_pool {
> struct kref kref;
> struct list_head list;
> struct work_struct release_work;
> - struct work_struct shrink_work;
> struct hlist_node node;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> +};
> +
> +struct {
static?
> struct list_lru list_lru;
> - struct mem_cgroup *next_shrink;
> - struct shrinker *shrinker;
Just curious, any reason to change the relative ordering of members
here? It produces a couple more lines of diff :)
> atomic_t nr_stored;
> -};
> + struct shrinker *shrinker;
> + struct work_struct shrink_work;
> + struct mem_cgroup *next_shrink;
> +} zswap;
>
> /*
> * struct zswap_entry
> @@ -301,9 +304,6 @@ static void zswap_update_total_size(void)
> * pool functions
> **********************************/
>
> -static void zswap_alloc_shrinker(struct zswap_pool *pool);
> -static void shrink_worker(struct work_struct *w);
> -
> static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> {
> int i;
> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> if (ret)
> goto error;
>
> - zswap_alloc_shrinker(pool);
> - if (!pool->shrinker)
> - goto error;
> -
> - pr_debug("using %s compressor\n", pool->tfm_name);
> -
Why are we removing this debug print?
> /* being the current pool takes 1 ref; this func expects the
> * caller to always add the new pool as the current pool
> */
> kref_init(&pool->kref);
> INIT_LIST_HEAD(&pool->list);
> - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
> - goto lru_fail;
> - shrinker_register(pool->shrinker);
> - INIT_WORK(&pool->shrink_work, shrink_worker);
> - atomic_set(&pool->nr_stored, 0);
>
> zswap_pool_debug("created", pool);
>
> return pool;
>
> -lru_fail:
> - list_lru_destroy(&pool->list_lru);
> - shrinker_free(pool->shrinker);
> error:
> if (pool->acomp_ctx)
> free_percpu(pool->acomp_ctx);
[..]
> @@ -816,14 +777,10 @@ void zswap_folio_swapin(struct folio *folio)
>
> void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
> {
> - struct zswap_pool *pool;
> -
> - /* lock out zswap pools list modification */
> + /* lock out zswap shrinker walking memcg tree */
> spin_lock(&zswap_pools_lock);
> - list_for_each_entry(pool, &zswap_pools, list) {
> - if (pool->next_shrink == memcg)
> - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> - }
> + if (zswap.next_shrink == memcg)
> + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
Now that next_shrink has nothing to do with zswap pools, it feels weird
that we are using zswap_pools_lock for its synchronization. Does it make
sense to have a separate lock for it just for semantic purposes?
> spin_unlock(&zswap_pools_lock);
> }
>
[..]
> @@ -1328,7 +1284,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
> static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> struct shrink_control *sc)
> {
> - struct zswap_pool *pool = shrinker->private_data;
> struct mem_cgroup *memcg = sc->memcg;
> struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
> unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
> @@ -1343,7 +1298,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> #else
> /* use pool stats instead of memcg stats */
> nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
"pool" is still being used here.
> - nr_stored = atomic_read(&pool->nr_stored);
> + nr_stored = atomic_read(&zswap.nr_stored);
> #endif
>
> if (!nr_stored)
[..]
> @@ -1804,6 +1749,21 @@ static int zswap_setup(void)
> if (ret)
> goto hp_fail;
>
> + shrink_wq = alloc_workqueue("zswap-shrink",
> + WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> + if (!shrink_wq)
> + goto hp_fail;
I think we need a new label here to call cpuhp_remove_multi_state(), but
apparently this is missing from the current code for some reason.
> +
> + zswap.shrinker = zswap_alloc_shrinker();
> + if (!zswap.shrinker)
> + goto shrinker_fail;
> + if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
> + goto lru_fail;
> + shrinker_register(zswap.shrinker);
> +
> + INIT_WORK(&zswap.shrink_work, shrink_worker);
> + atomic_set(&zswap.nr_stored, 0);
> +
> pool = __zswap_pool_create_fallback();
> if (pool) {
> pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> @@ -1815,19 +1775,16 @@ static int zswap_setup(void)
> zswap_enabled = false;
> }
>
> - shrink_wq = alloc_workqueue("zswap-shrink",
> - WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> - if (!shrink_wq)
> - goto fallback_fail;
> -
> if (zswap_debugfs_init())
> pr_warn("debugfs initialization failed\n");
> zswap_init_state = ZSWAP_INIT_SUCCEED;
> return 0;
>
> -fallback_fail:
> - if (pool)
> - zswap_pool_destroy(pool);
> +lru_fail:
> + list_lru_destroy(&zswap.list_lru);
Do we need to call list_lru_destroy() here? I know it is currently being
called if list_lru_init_memcg() fails, but I fail to understand why. It
seems like list_lru_destroy() will do nothing anyway.
> + shrinker_free(zswap.shrinker);
> +shrinker_fail:
> + destroy_workqueue(shrink_wq);
> hp_fail:
> kmem_cache_destroy(zswap_entry_cache);
> cache_fail:
>
> --
> b4 0.10.1
On 2024/2/13 20:57, Yosry Ahmed wrote:
> On Sun, Feb 11, 2024 at 01:57:04PM +0000, Chengming Zhou wrote:
>> Dynamic zswap_pool creation may create/reuse to have multiple
>> zswap_pools in a list, only the first will be current used.
>>
>> Each zswap_pool has its own lru and shrinker, which is not
>> necessary and has its problem:
>>
>> 1. When memory has pressure, all shrinker of zswap_pools will
>> try to shrink its own lru, there is no order between them.
>>
>> 2. When zswap limit hit, only the last zswap_pool's shrink_work
>> will try to shrink its lru, which is inefficient.
>>
>> Anyway, having a global lru and shrinker shared by all zswap_pools
>> is better and efficient.
>
> It is also a great simplification.
>
>>
>> Signed-off-by: Chengming Zhou <[email protected]>
>> ---
>> mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
>> 1 file changed, 55 insertions(+), 98 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 62fe307521c9..7668db8c10e3 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -176,14 +176,17 @@ struct zswap_pool {
>> struct kref kref;
>> struct list_head list;
>> struct work_struct release_work;
>> - struct work_struct shrink_work;
>> struct hlist_node node;
>> char tfm_name[CRYPTO_MAX_ALG_NAME];
>> +};
>> +
>> +struct {
>
> static?
Ah, right, will add static.
>
>> struct list_lru list_lru;
>> - struct mem_cgroup *next_shrink;
>> - struct shrinker *shrinker;
>
> Just curious, any reason to change the relative ordering of members
> here? It produces a couple more lines of diff :)
The list_lru and nr_stored atomic variables are used in the zswap_store/load
hotpath, while the other shrinker-related members sound like cold path. I thought
it's normal and clearer to group them according to their usages.
>
>> atomic_t nr_stored;
>> -};
>> + struct shrinker *shrinker;
>> + struct work_struct shrink_work;
>> + struct mem_cgroup *next_shrink;
>> +} zswap;
>>
>> /*
>> * struct zswap_entry
>> @@ -301,9 +304,6 @@ static void zswap_update_total_size(void)
>> * pool functions
>> **********************************/
>>
>> -static void zswap_alloc_shrinker(struct zswap_pool *pool);
>> -static void shrink_worker(struct work_struct *w);
>> -
>> static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>> {
>> int i;
>> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>> if (ret)
>> goto error;
>>
>> - zswap_alloc_shrinker(pool);
>> - if (!pool->shrinker)
>> - goto error;
>> -
>> - pr_debug("using %s compressor\n", pool->tfm_name);
>> -
>
> Why are we removing this debug print?
Oh, I just noticed it's only necessary to print to dmesg when "create" succeeds,
and the "zswap_pool_debug()" below will print its compressor too.
>
>> /* being the current pool takes 1 ref; this func expects the
>> * caller to always add the new pool as the current pool
>> */
>> kref_init(&pool->kref);
>> INIT_LIST_HEAD(&pool->list);
>> - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
>> - goto lru_fail;
>> - shrinker_register(pool->shrinker);
>> - INIT_WORK(&pool->shrink_work, shrink_worker);
>> - atomic_set(&pool->nr_stored, 0);
>>
>> zswap_pool_debug("created", pool);
>>
>> return pool;
>>
>> -lru_fail:
>> - list_lru_destroy(&pool->list_lru);
>> - shrinker_free(pool->shrinker);
>> error:
>> if (pool->acomp_ctx)
>> free_percpu(pool->acomp_ctx);
> [..]
>> @@ -816,14 +777,10 @@ void zswap_folio_swapin(struct folio *folio)
>>
>> void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
>> {
>> - struct zswap_pool *pool;
>> -
>> - /* lock out zswap pools list modification */
>> + /* lock out zswap shrinker walking memcg tree */
>> spin_lock(&zswap_pools_lock);
>> - list_for_each_entry(pool, &zswap_pools, list) {
>> - if (pool->next_shrink == memcg)
>> - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
>> - }
>> + if (zswap.next_shrink == memcg)
>> + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
>
> Now that next_shrink has nothing to do with zswap pools, it feels weird
> that we are using zswap_pools_lock for its synchronization. Does it make
> sense to have a separate lock for it just for semantic purposes?
Agree, I think so, it's clearer to have another lock.
>
>> spin_unlock(&zswap_pools_lock);
>> }
>>
> [..]
>> @@ -1328,7 +1284,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>> static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>> struct shrink_control *sc)
>> {
>> - struct zswap_pool *pool = shrinker->private_data;
>> struct mem_cgroup *memcg = sc->memcg;
>> struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
>> unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
>> @@ -1343,7 +1298,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>> #else
>> /* use pool stats instead of memcg stats */
>> nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
>
> "pool" is still being used here.
Oops, should be changed to zswap_pool_total_size here.
>
>> - nr_stored = atomic_read(&pool->nr_stored);
>> + nr_stored = atomic_read(&zswap.nr_stored);
>> #endif
>>
>> if (!nr_stored)
> [..]
>> @@ -1804,6 +1749,21 @@ static int zswap_setup(void)
>> if (ret)
>> goto hp_fail;
>>
>> + shrink_wq = alloc_workqueue("zswap-shrink",
>> + WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
>> + if (!shrink_wq)
>> + goto hp_fail;
>
> I think we need a new label here to call cpuhp_remove_multi_state(), but
> apparently this is missing from the current code for some reason.
You are right! This should use a new label to call "cpuhp_remove_multi_state()",
will fix it.
>
>> +
>> + zswap.shrinker = zswap_alloc_shrinker();
>> + if (!zswap.shrinker)
>> + goto shrinker_fail;
>> + if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
>> + goto lru_fail;
>> + shrinker_register(zswap.shrinker);
>> +
>> + INIT_WORK(&zswap.shrink_work, shrink_worker);
>> + atomic_set(&zswap.nr_stored, 0);
>> +
>> pool = __zswap_pool_create_fallback();
>> if (pool) {
>> pr_info("loaded using pool %s/%s\n", pool->tfm_name,
>> @@ -1815,19 +1775,16 @@ static int zswap_setup(void)
>> zswap_enabled = false;
>> }
>>
>> - shrink_wq = alloc_workqueue("zswap-shrink",
>> - WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
>> - if (!shrink_wq)
>> - goto fallback_fail;
>> -
>> if (zswap_debugfs_init())
>> pr_warn("debugfs initialization failed\n");
>> zswap_init_state = ZSWAP_INIT_SUCCEED;
>> return 0;
>>
>> -fallback_fail:
>> - if (pool)
>> - zswap_pool_destroy(pool);
>> +lru_fail:
>> + list_lru_destroy(&zswap.list_lru);
>
> Do we need to call list_lru_destroy() here? I know it is currently being
> called if list_lru_init_memcg() fails, but I fail to understand why. It
> seems like list_lru_destroy() will do nothing anyway.
Right, there is no need to call list_lru_destroy() here — it should do nothing,
so I will delete it.
Thanks!
>
>> + shrinker_free(zswap.shrinker);
>> +shrinker_fail:
>> + destroy_workqueue(shrink_wq);
>> hp_fail:
>> kmem_cache_destroy(zswap_entry_cache);
>> cache_fail:
>>
>> --
>> b4 0.10.1
> >> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >> if (ret)
> >> goto error;
> >>
> >> - zswap_alloc_shrinker(pool);
> >> - if (!pool->shrinker)
> >> - goto error;
> >> -
> >> - pr_debug("using %s compressor\n", pool->tfm_name);
> >> -
> >
> > Why are we removing this debug print?
This pr_debug() was introduced when dynamic zswap pools were introduced,
and it was supposed to be printed right after the compressor is
initialized. IOW, it is supposed to be after the call to
cpuhp_state_add_instance() succeeds. The call to zswap_alloc_shrinker()
was mistakenly added above that pr_debug() call.
Anyway, I just realized you are now removing all failure cases between
that pr_debug() and the zswap_pool_debug() below, so there is no need to
keep both. You are right.
I am wondering if these debug prints are useful at all now, but that's a
question for another day :)