2024-06-06 16:53:34

by Takero Funaki

[permalink] [raw]
Subject: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

This patch limits the number of zpools used by zswap on smaller systems.

Currently, zswap allocates 32 pools unconditionally. This was
implemented to reduce contention on per-zpool locks. However, it incurs
allocation overhead by distributing pages across pools, wasting memory
on systems with fewer CPUs and less RAM.

This patch allocates approximately 2*CPU zpools, with a minimum of 1
zpool for single-CPU systems and up to 32 zpools for systems with 16 or
more CPUs. This number is sufficient to keep the probability of
busy-waiting by a thread under 40%. The upper limit of 32 zpools remains
unchanged.

For memory, it limits to 1 zpool per 60MB of memory for the 20% default
max pool size limit, assuming the best case with no fragmentation in
zspages. It expects 90% pool usage for zsmalloc.

Signed-off-by: Takero Funaki <[email protected]>
---
mm/zswap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 4de342a63bc2..e957bfdeaf70 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -124,8 +124,11 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);

-/* Number of zpools in zswap_pool (empirically determined for scalability) */
-#define ZSWAP_NR_ZPOOLS 32
+/*
+ * Number of max zpools in zswap_pool (empirically determined for scalability)
+ * This must be order of 2, for pointer hashing.
+ */
+#define ZSWAP_NR_ZPOOLS_MAX 32

/* Enable/disable memory pressure-based shrinker. */
static bool zswap_shrinker_enabled = IS_ENABLED(
@@ -157,12 +160,13 @@ struct crypto_acomp_ctx {
* needs to be verified that it's still valid in the tree.
*/
struct zswap_pool {
- struct zpool *zpools[ZSWAP_NR_ZPOOLS];
+ struct zpool *zpools[ZSWAP_NR_ZPOOLS_MAX];
struct crypto_acomp_ctx __percpu *acomp_ctx;
struct percpu_ref ref;
struct list_head list;
struct work_struct release_work;
struct hlist_node node;
+ unsigned char nr_zpools_order;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};

@@ -243,11 +247,55 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))

+static unsigned long zswap_max_pages(void);
+
/*********************************
* pool functions
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);

+/*
+ * Estimate the optimal number of zpools based on CPU and memory.
+ *
+ * For CPUs, aim for 40% or lower probability of busy-waiting from a thread,
+ * assuming all cores are accessing zswap concurrently.
+ * The threshold is chosen for the simplicity of the formula:
+ * The probability is 1-(1-(1/pool))^(thr-1). For 40% threshold, this is
+ * approximately pool = 2 * threads rounded up to orders of 2.
+ * Threads \ Pools
+ * 2 4 8 16 32
+ * 2 0.50 0.25 < 0.13 0.06 0.03
+ * 4 0.88 0.58 0.33 < 0.18 0.09
+ * 6 0.97 0.76 0.49 0.28 < 0.15
+ * 8 0.99 0.87 0.61 0.36 < 0.20
+ * 10 1.00 0.92 0.70 0.44 0.25 <
+ * 16 1.00 0.99 0.87 0.62 0.38 <
+ * 18 1.00 0.99 0.90 0.67 0.42
+ *
+ * For memory, expect 90% pool usage for zsmalloc in the best case.
+ * Assuming uniform distribution, we need to store:
+ * 590 : sum of pages_per_zspage
+ * * 0.5 : about half of zspage is empty if no fragmentation
+ * / (1-0.9) : 90% target usage
+ * = 2950 : expected max pages of a zpool,
+ * equivalent to 60MB RAM for a 20% max_pool_percent.
+ */
+static void __zswap_set_nr_zpools(struct zswap_pool *pool)
+{
+ unsigned long mem = zswap_max_pages();
+ unsigned long cpu = num_online_cpus();
+
+ mem = DIV_ROUND_UP(mem, 2950);
+ mem = min(max(1, mem), ZSWAP_NR_ZPOOLS_MAX);
+
+ if (cpu <= 1)
+ cpu = 1;
+ else
+ cpu = 1 << ilog2(min(cpu * 2, ZSWAP_NR_ZPOOLS_MAX);
+
+ pool->nr_zpools_order = ilog2(min(mem, cpu));
+}
+
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
int i;
@@ -271,7 +319,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
if (!pool)
return NULL;

- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+ __zswap_set_nr_zpools(pool);
+
+ for (i = 0; i < (1 << pool->nr_zpools_order); i++) {
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x",
atomic_inc_return(&zswap_pools_count));
@@ -372,7 +422,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);

- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ for (i = 0; i < (1 << pool->nr_zpools_order); i++)
zpool_destroy_pool(pool->zpools[i]);
kfree(pool);
}
@@ -513,7 +563,7 @@ unsigned long zswap_total_pages(void)
list_for_each_entry_rcu(pool, &zswap_pools, list) {
int i;

- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ for (i = 0; i < (1 << pool->nr_zpools_order); i++)
total += zpool_get_total_pages(pool->zpools[i]);
}
rcu_read_unlock();
@@ -822,7 +872,10 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)

static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
- return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
+ if (entry->pool->nr_zpools_order == 0)
+ return entry->pool->zpools[0];
+
+ return entry->pool->zpools[hash_ptr(entry, entry->pool->nr_zpools_order)];
}

/*
--
2.43.0



2024-06-06 17:50:58

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

On Thu, Jun 6, 2024 at 9:53 AM Takero Funaki <[email protected]> wrote:
>
> This patch limits the number of zpools used by zswap on smaller systems.
>
> Currently, zswap allocates 32 pools unconditionally. This was
> implemented to reduce contention on per-zpool locks. However, it incurs
> allocation overhead by distributing pages across pools, wasting memory
> on systems with fewer CPUs and less RAM.
>
> This patch allocates approximately 2*CPU zpools, with a minimum of 1
> zpool for single-CPU systems and up to 32 zpools for systems with 16 or
> more CPUs. This number is sufficient to keep the probability of
> busy-waiting by a thread under 40%. The upper limit of 32 zpools remains
> unchanged.
>
> For memory, it limits to 1 zpool per 60MB of memory for the 20% default
> max pool size limit, assuming the best case with no fragmentation in
> zspages. It expects 90% pool usage for zsmalloc.
>
> Signed-off-by: Takero Funaki <[email protected]>

There are a lot of magic numbers in this patch, and it seems like it's
all based on theory. I don't object to making the number of zpools
dynamic in some way, but unless we do it in a data-driven way where we
understand the implications, I think the added complexity and
inconsistency is not justified.

For example, 2*CPU zpools is an overkill and will cause a lot of
fragmentation. We use 32 zpools right now for machines with 100s of
CPUs. I know that you are keeping 32 as the limit, but why 2*CPUs if
nr_cpus <= 16?

Also, the limitation based on memory size assumes that zsmalloc is the
only allocator used by zswap, which is unfortunately not the case.

The current implementation using 32 zpools all the time is not
perfect, and I did write a patch to make it at least be min(nr_cpus,
32), but it is simple and it works. Complexity should be justified.

> ---
> mm/zswap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 60 insertions(+), 7 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 4de342a63bc2..e957bfdeaf70 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -124,8 +124,11 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
> module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
> uint, 0644);
>
> -/* Number of zpools in zswap_pool (empirically determined for scalability) */
> -#define ZSWAP_NR_ZPOOLS 32
> +/*
> + * Number of max zpools in zswap_pool (empirically determined for scalability)
> + * This must be order of 2, for pointer hashing.
> + */
> +#define ZSWAP_NR_ZPOOLS_MAX 32
>
> /* Enable/disable memory pressure-based shrinker. */
> static bool zswap_shrinker_enabled = IS_ENABLED(
> @@ -157,12 +160,13 @@ struct crypto_acomp_ctx {
> * needs to be verified that it's still valid in the tree.
> */
> struct zswap_pool {
> - struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> + struct zpool *zpools[ZSWAP_NR_ZPOOLS_MAX];
> struct crypto_acomp_ctx __percpu *acomp_ctx;
> struct percpu_ref ref;
> struct list_head list;
> struct work_struct release_work;
> struct hlist_node node;
> + unsigned char nr_zpools_order;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> };
>
> @@ -243,11 +247,55 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> +static unsigned long zswap_max_pages(void);
> +
> /*********************************
> * pool functions
> **********************************/
> static void __zswap_pool_empty(struct percpu_ref *ref);
>
> +/*
> + * Estimate the optimal number of zpools based on CPU and memory.
> + *
> + * For CPUs, aim for 40% or lower probability of busy-waiting from a thread,
> + * assuming all cores are accessing zswap concurrently.
> + * The threshold is chosen for the simplicity of the formula:
> + * The probability is 1-(1-(1/pool))^(thr-1). For 40% threshold, this is
> + * approximately pool = 2 * threads rounded up to orders of 2.
> + * Threads \ Pools
> + * 2 4 8 16 32
> + * 2 0.50 0.25 < 0.13 0.06 0.03
> + * 4 0.88 0.58 0.33 < 0.18 0.09
> + * 6 0.97 0.76 0.49 0.28 < 0.15
> + * 8 0.99 0.87 0.61 0.36 < 0.20
> + * 10 1.00 0.92 0.70 0.44 0.25 <
> + * 16 1.00 0.99 0.87 0.62 0.38 <
> + * 18 1.00 0.99 0.90 0.67 0.42
> + *
> + * For memory, expect 90% pool usage for zsmalloc in the best case.
> + * Assuming uniform distribution, we need to store:
> + * 590 : sum of pages_per_zspage
> + * * 0.5 : about half of zspage is empty if no fragmentation
> + * / (1-0.9) : 90% target usage
> + * = 2950 : expected max pages of a zpool,
> + * equivalent to 60MB RAM for a 20% max_pool_percent.
> + */
> +static void __zswap_set_nr_zpools(struct zswap_pool *pool)
> +{
> + unsigned long mem = zswap_max_pages();
> + unsigned long cpu = num_online_cpus();
> +
> + mem = DIV_ROUND_UP(mem, 2950);
> + mem = min(max(1, mem), ZSWAP_NR_ZPOOLS_MAX);
> +
> + if (cpu <= 1)
> + cpu = 1;
> + else
> + cpu = 1 << ilog2(min(cpu * 2, ZSWAP_NR_ZPOOLS_MAX);
> +
> + pool->nr_zpools_order = ilog2(min(mem, cpu));
> +}
> +
> static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> {
> int i;
> @@ -271,7 +319,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> if (!pool)
> return NULL;
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> + __zswap_set_nr_zpools(pool);
> +
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++) {
> /* unique name for each pool specifically required by zsmalloc */
> snprintf(name, 38, "zswap%x",
> atomic_inc_return(&zswap_pools_count));
> @@ -372,7 +422,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> free_percpu(pool->acomp_ctx);
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++)
> zpool_destroy_pool(pool->zpools[i]);
> kfree(pool);
> }
> @@ -513,7 +563,7 @@ unsigned long zswap_total_pages(void)
> list_for_each_entry_rcu(pool, &zswap_pools, list) {
> int i;
>
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + for (i = 0; i < (1 << pool->nr_zpools_order); i++)
> total += zpool_get_total_pages(pool->zpools[i]);
> }
> rcu_read_unlock();
> @@ -822,7 +872,10 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
>
> static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> {
> - return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
> + if (entry->pool->nr_zpools_order == 0)
> + return entry->pool->zpools[0];
> +
> + return entry->pool->zpools[hash_ptr(entry, entry->pool->nr_zpools_order)];
> }
>
> /*
> --
> 2.43.0
>

2024-06-07 01:01:06

by Takero Funaki

[permalink] [raw]
Subject: Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

2024年6月7日(金) 2:46 Yosry Ahmed <[email protected]>:

>
> There are a lot of magic numbers in this patch, and it seems like it's
> all based on theory. I don't object to making the number of zpools
> dynamic in some way, but unless we do it in a data-driven way where we
> understand the implications, I think the added complexity and
> inconsistency is not justified.
>
> For example, 2*CPU zpools is an overkill and will cause a lot of
> fragmentation. We use 32 zpools right now for machines with 100s of
> CPUs. I know that you are keeping 32 as the limit, but why 2*CPUs if
> nr_cpus <= 16?
>
> Also, the limitation based on memory size assumes that zsmalloc is the
> only allocator used by zswap, which is unfortunately not the case.
>
> The current implementation using 32 zpools all the time is not
> perfect, and I did write a patch to make it at least be min(nr_cpus,
> 32), but it is simple and it works. Complexity should be justified.
>

Thanks for your comments.
I agree the 2*cpu is too much. it was conservatively chosen assuming
1/2 contention while all cores are accessing zswap. Much smaller
factor or non-linear scale as your comments in the main thread would
be better.

I found your patch from the main thread.
One point I'm afraid, this hashing will fail if nr_zswap_zpools is 1
or is not rounded to order of 2. hash_ptr crashes when bit is 0.

> + return entry->pool->zpools[hash_ptr(entry, ilog2(nr_zswap_zpools))];

2024-06-07 05:00:04

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

On Thu, Jun 6, 2024 at 6:01 PM Takero Funaki <[email protected]> wrote:
>
> 2024年6月7日(金) 2:46 Yosry Ahmed <[email protected]>:
>
> >
> > There are a lot of magic numbers in this patch, and it seems like it's
> > all based on theory. I don't object to making the number of zpools
> > dynamic in some way, but unless we do it in a data-driven way where we
> > understand the implications, I think the added complexity and
> > inconsistency is not justified.
> >
> > For example, 2*CPU zpools is an overkill and will cause a lot of
> > fragmentation. We use 32 zpools right now for machines with 100s of
> > CPUs. I know that you are keeping 32 as the limit, but why 2*CPUs if
> > nr_cpus <= 16?
> >
> > Also, the limitation based on memory size assumes that zsmalloc is the
> > only allocator used by zswap, which is unfortunately not the case.
> >
> > The current implementation using 32 zpools all the time is not
> > perfect, and I did write a patch to make it at least be min(nr_cpus,
> > 32), but it is simple and it works. Complexity should be justified.
> >
>
> Thanks for your comments.
> I agree the 2*cpu is too much. it was conservatively chosen assuming
> 1/2 contention while all cores are accessing zswap. Much smaller
> factor or non-linear scale as your comments in the main thread would
> be better.

Chengming is currently experimenting with fixing the lock contention
problem in zsmalloc by making the lock more granular. Based on the
data he finds, we may be able to just drop the multiple zpools patch
from zswap.

I'd wait for his findings before investing more into improving this.

>
> I found your patch from the main thread.
> One point I'm afraid, this hashing will fail if nr_zswap_zpools is 1
> or is not rounded to order of 2. hash_ptr crashes when bit is 0.

Yeah that patch was just for experimenting, I did not test it well.
Thanks for taking a look.

2024-06-07 05:50:32

by Chengming Zhou

[permalink] [raw]
Subject: Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

On 2024/6/7 12:58, Yosry Ahmed wrote:
> On Thu, Jun 6, 2024 at 6:01 PM Takero Funaki <[email protected]> wrote:
>>
>> 2024年6月7日(金) 2:46 Yosry Ahmed <[email protected]>:
>>
>>>
>>> There are a lot of magic numbers in this patch, and it seems like it's
>>> all based on theory. I don't object to making the number of zpools
>>> dynamic in some way, but unless we do it in a data-driven way where we
>>> understand the implications, I think the added complexity and
>>> inconsistency is not justified.
>>>
>>> For example, 2*CPU zpools is an overkill and will cause a lot of
>>> fragmentation. We use 32 zpools right now for machines with 100s of
>>> CPUs. I know that you are keeping 32 as the limit, but why 2*CPUs if
>>> nr_cpus <= 16?
>>>
>>> Also, the limitation based on memory size assumes that zsmalloc is the
>>> only allocator used by zswap, which is unfortunately not the case.
>>>
>>> The current implementation using 32 zpools all the time is not
>>> perfect, and I did write a patch to make it at least be min(nr_cpus,
>>> 32), but it is simple and it works. Complexity should be justified.
>>>
>>
>> Thanks for your comments.
>> I agree the 2*cpu is too much. it was conservatively chosen assuming
>> 1/2 contention while all cores are accessing zswap. Much smaller
>> factor or non-linear scale as your comments in the main thread would
>> be better.
>
> Chengming is currently experimenting with fixing the lock contention
> problem in zsmalloc by making the lock more granular. Based on the
> data he finds, we may be able to just drop the multiple zpools patch
> from zswap.

Hope so, not sure now, will test and compare one pool with multiple pools.

>
> I'd wait for his findings before investing more into improving this.

Ok, I will get back with code and some testing data a few days later.

Thanks.

>
>>
>> I found your patch from the main thread.
>> One point I'm afraid, this hashing will fail if nr_zswap_zpools is 1
>> or is not rounded to order of 2. hash_ptr crashes when bit is 0.
>
> Yeah that patch was just for experimenting, I did not test it well.
> Thanks for taking a look.

2024-06-07 09:26:42

by Nhat Pham

[permalink] [raw]
Subject: Re: [PATCH] mm: zswap: limit number of zpools based on CPU and RAM

On Thu, Jun 6, 2024 at 5:53 PM Takero Funaki <[email protected]> wrote:
>
> This patch limits the number of zpools used by zswap on smaller systems.
>
> Currently, zswap allocates 32 pools unconditionally. This was
> implemented to reduce contention on per-zpool locks. However, it incurs
> allocation overhead by distributing pages across pools, wasting memory
> on systems with fewer CPUs and less RAM.
>
> This patch allocates approximately 2*CPU zpools, with a minimum of 1
> zpool for single-CPU systems and up to 32 zpools for systems with 16 or
> more CPUs. This number is sufficient to keep the probability of
> busy-waiting by a thread under 40%. The upper limit of 32 zpools remains
> unchanged.
>
> For memory, it limits to 1 zpool per 60MB of memory for the 20% default
> max pool size limit, assuming the best case with no fragmentation in
> zspages. It expects 90% pool usage for zsmalloc.
>
> Signed-off-by: Takero Funaki <[email protected]>
> ---

I think this needs benchmarking. Theoretical justification is nice,
but I'm not convinced it'll translate neatly to the messy world of
real life systems.