Zswap creates multiple zpools to improve concurrency. Each zsmalloc
zpool creates its own 'zs_handle' and 'zspage' slab caches. Currently we
end up with 32 slab caches of each type.

Since each slab cache holds some free objects, we end up with a lot of
free objects distributed among the separate zpool caches. Slab caches
are designed to handle concurrent allocations by using percpu
structures, so having a single instance of each cache should be enough,
and avoids wasting more memory than needed due to fragmentation.

Additionally, having more slab caches than needed unnecessarily slows
down code paths that iterate slab_caches.

In the results reported by Eric in [1], the amount of unused slab memory
in these caches goes down from 242808 bytes to 29216 bytes (-88%). This
is calculated by (num_objs - active_objs) * objsize for each 'zs_handle'
and 'zspage' cache. Although this patch did not help with the allocation
failure reported by Eric with zswap + zsmalloc, I think it is still
worth merging on its own.

[1] https://lore.kernel.org/lkml/20240604134458.3ae4396a@yea/

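As a rough illustration (not part of the patch, and assuming slab
merging is disabled so the caches appear under their own names), the
unused-slab figure above can be reproduced from /proc/slabinfo with a
small user-space sketch like this:

#include <stdio.h>
#include <string.h>

/*
 * Sums (num_objs - active_objs) * objsize over all 'zs_handle' and
 * 'zspage' lines of /proc/slabinfo (slabinfo 2.x layout:
 * name <active_objs> <num_objs> <objsize> ...). Reading the file
 * typically requires root.
 */
int main(void)
{
	char name[64], line[512];
	unsigned long active, num, objsize, unused = 0;
	FILE *f = fopen("/proc/slabinfo", "r");

	if (!f) {
		perror("/proc/slabinfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Header lines fail the numeric conversions and are skipped. */
		if (sscanf(line, "%63s %lu %lu %lu",
			   name, &active, &num, &objsize) != 4)
			continue;
		if (strncmp(name, "zs_handle", 9) && strncmp(name, "zspage", 6))
			continue;
		unused += (num - active) * objsize;
	}
	fclose(f);
	printf("unused zsmalloc slab bytes: %lu\n", unused);
	return 0;
}
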
Signed-off-by: Yosry Ahmed <[email protected]>
---
mm/zsmalloc.c | 87 ++++++++++++++++++++++++++-------------------------
1 file changed, 44 insertions(+), 43 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b42d3545ca856..76d9976442a4a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -220,8 +220,6 @@ struct zs_pool {
const char *name;
struct size_class *size_class[ZS_SIZE_CLASSES];
- struct kmem_cache *handle_cachep;
- struct kmem_cache *zspage_cachep;
atomic_long_t pages_allocated;
@@ -289,50 +287,29 @@ static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif
-static int create_cache(struct zs_pool *pool)
-{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
- 0, 0, NULL);
- if (!pool->handle_cachep)
- return 1;
-
- pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
- 0, 0, NULL);
- if (!pool->zspage_cachep) {
- kmem_cache_destroy(pool->handle_cachep);
- pool->handle_cachep = NULL;
- return 1;
- }
-
- return 0;
-}
+static struct kmem_cache *zs_handle_cache;
+static struct kmem_cache *zspage_cache;
-static void destroy_cache(struct zs_pool *pool)
+static unsigned long cache_alloc_handle(gfp_t gfp)
{
- kmem_cache_destroy(pool->handle_cachep);
- kmem_cache_destroy(pool->zspage_cachep);
-}
-
-static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
-{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ return (unsigned long)kmem_cache_alloc(zs_handle_cache,
gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
-static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
+static void cache_free_handle(unsigned long handle)
{
- kmem_cache_free(pool->handle_cachep, (void *)handle);
+ kmem_cache_free(zs_handle_cache, (void *)handle);
}
-static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
+static struct zspage *cache_alloc_zspage(gfp_t flags)
{
- return kmem_cache_zalloc(pool->zspage_cachep,
+ return kmem_cache_zalloc(zspage_cache,
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
-static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+static void cache_free_zspage(struct zspage *zspage)
{
- kmem_cache_free(pool->zspage_cachep, zspage);
+ kmem_cache_free(zspage_cache, zspage);
}
/* pool->lock(which owns the handle) synchronizes races */
@@ -837,7 +814,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
page = next;
} while (page != NULL);
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
@@ -950,7 +927,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
{
int i;
struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
- struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+ struct zspage *zspage = cache_alloc_zspage(gfp);
if (!zspage)
return NULL;
@@ -967,7 +944,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
dec_zone_page_state(pages[i], NR_ZSPAGES);
__free_page(pages[i]);
}
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
return NULL;
}
@@ -1338,7 +1315,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
if (unlikely(size > ZS_MAX_ALLOC_SIZE))
return (unsigned long)ERR_PTR(-ENOSPC);
- handle = cache_alloc_handle(pool, gfp);
+ handle = cache_alloc_handle(gfp);
if (!handle)
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1363,7 +1340,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
return (unsigned long)ERR_PTR(-ENOMEM);
}
@@ -1441,7 +1418,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -2111,9 +2088,6 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
- if (create_cache(pool))
- goto err;
-
/*
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -2234,16 +2208,41 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_cache(pool);
kfree(pool->name);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void zs_destroy_caches(void)
+{
+ kmem_cache_destroy(zs_handle_cache);
+ kmem_cache_destroy(zspage_cache);
+ zs_handle_cache = NULL;
+ zspage_cache = NULL;
+}
+
+static int zs_create_caches(void)
+{
+ zs_handle_cache = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ zspage_cache = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+
+ if (!zs_handle_cache || !zspage_cache) {
+ zs_destroy_caches();
+ return -1;
+ }
+ return 0;
+}
+
static int __init zs_init(void)
{
int ret;
+ ret = zs_create_caches();
+ if (ret)
+ goto out;
+
ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
zs_cpu_prepare, zs_cpu_dead);
if (ret)
@@ -2258,6 +2257,7 @@ static int __init zs_init(void)
return 0;
out:
+ zs_destroy_caches();
return ret;
}
@@ -2269,6 +2269,7 @@ static void __exit zs_exit(void)
cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
zs_stat_exit();
+ zs_destroy_caches();
}
module_init(zs_init);
--
2.45.1.288.g0e0cd299f1-goog
On 6/4/24 7:53 PM, Yosry Ahmed wrote:
> Zswap creates multiple zpools to improve concurrency. Each zsmalloc
> zpool creates its own 'zs_handle' and 'zspage' slab caches. Currently we
> end up with 32 slab caches of each type.
>
> Since each slab cache holds some free objects, we end up with a lot of
> free objects distributed among the separate zpool caches. Slab caches
> are designed to handle concurrent allocations by using percpu
> structures, so having a single instance of each cache should be enough,
> and avoids wasting more memory than needed due to fragmentation.
>
> Additionally, having more slab caches than needed unnecessarily slows
> down code paths that iterate slab_caches.
>
> In the results reported by Eric in [1], the amount of unused slab memory
> in these caches goes down from 242808 bytes to 29216 bytes (-88%). This
> is calculated by (num_objs - active_objs) * objsize for each 'zs_handle'
> and 'zspage' cache. Although this patch did not help with the allocation
> failure reported by Eric with zswap + zsmalloc, I think it is still
> worth merging on its own.
>
> [1]https://lore.kernel.org/lkml/20240604134458.3ae4396a@yea/
>
> Signed-off-by: Yosry Ahmed <[email protected]>
As mentioned in the thread, CONFIG_SLAB_MERGE_DEFAULT would normally cover
the zsmalloc caches case too, but was disabled. I agree with the arguments
for this change though, so it doesn't depend on the general merging.

Acked-by: Vlastimil Babka <[email protected]>
On (24/06/04 17:53), Yosry Ahmed wrote:
> Zswap creates multiple zpools to improve concurrency. Each zsmalloc
> zpool creates its own 'zs_handle' and 'zspage' slab caches. Currently we
> end up with 32 slab caches of each type.
>
> Since each slab cache holds some free objects, we end up with a lot of
> free objects distributed among the separate zpool caches. Slab caches
> are designed to handle concurrent allocations by using percpu
> structures, so having a single instance of each cache should be enough,
> and avoids wasting more memory than needed due to fragmentation.
>
> Additionally, having more slab caches than needed unnecessarily slows
> down code paths that iterate slab_caches.
>
> In the results reported by Eric in [1], the amount of unused slab memory
> in these caches goes down from 242808 bytes to 29216 bytes (-88%). This
> is calculated by (num_objs - active_objs) * objsize for each 'zs_handle'
> and 'zspage' cache. Although this patch did not help with the allocation
> failure reported by Eric with zswap + zsmalloc, I think it is still
> worth merging on its own.
>
> [1]https://lore.kernel.org/lkml/20240604134458.3ae4396a@yea/
>
> Signed-off-by: Yosry Ahmed <[email protected]>
Makes perfect sense, thanks.
Reviewed-by: Sergey Senozhatsky <[email protected]>
On Tue, Jun 04, 2024 at 05:53:40PM +0000, Yosry Ahmed wrote:
> Zswap creates multiple zpools to improve concurrency. Each zsmalloc
> zpool creates its own 'zs_handle' and 'zspage' slab caches. Currently we
> end up with 32 slab caches of each type.
>
> Since each slab cache holds some free objects, we end up with a lot of
> free objects distributed among the separate zpool caches. Slab caches
> are designed to handle concurrent allocations by using percpu
> structures, so having a single instance of each cache should be enough,
> and avoids wasting more memory than needed due to fragmentation.
>
> Additionally, having more slab caches than needed unnecessarily slows
> down code paths that iterate slab_caches.
>
> In the results reported by Eric in [1], the amount of unused slab memory
> in these caches goes down from 242808 bytes to 29216 bytes (-88%). This
> is calculated by (num_objs - active_objs) * objsize for each 'zs_handle'
> and 'zspage' cache. Although this patch did not help with the allocation
> failure reported by Eric with zswap + zsmalloc, I think it is still
> worth merging on its own.
>
> [1]https://lore.kernel.org/lkml/20240604134458.3ae4396a@yea/
I doubt this is the right direction.

Zsmalloc is used for various purposes, each with different object
lifecycles. For example, swap operations involve relatively short-lived
objects, while filesystem use cases might have longer-lived objects.
This mix of lifecycles could lead to fragmentation with this approach.

I believe the original problem arose when zsmalloc reduced its lock
granularity from the class level to a global level. And then, zswap went
to mitigate the issue with multiple zpools, but it's essentially another
bandaid on top of the existing problem, IMO.

The correct approach would be to further reduce the zsmalloc lock
granularity.
On Thu, Jun 6, 2024 at 3:36 PM Minchan Kim <[email protected]> wrote:
>
> On Tue, Jun 04, 2024 at 05:53:40PM +0000, Yosry Ahmed wrote:
> > [...]
>
> I doubt this is the right direction.
>
> Zsmalloc is used for various purposes, each with different object
> lifecycles. For example, swap operations involve relatively short-lived
> objects, while filesystem use cases might have longer-lived objects.
> This mix of lifecycles could lead to fragmentation with this approach.
Even in a swapfile, some objects can be short-lived and some objects
can be long-lived, and the line between swap and file systems
becomes blurry with shmem/tmpfs. I don't think having separate caches
here is vital, but I am not generally familiar with the file system
use cases and I don't have data to prove/disprove it.
>
> I believe the original problem arose when zsmalloc reduced its lock
> granularity from the class level to a global level. And then, Zswap went
> to mitigate the issue with multiple zpools, but it's essentially another
> bandaid on top of the existing problem, IMO.
IIRC we reduced the granularity when we added writeback support to
zsmalloc, which was relatively recent. I think we have seen lock
contention with zsmalloc long before that. We have had a similar patch
internally to use multiple zpools in zswap for many years now.

+Yu Zhao

Yu has more historical context about this, I am hoping he will shed
more light on this.
>
> The correct approach would be to further reduce the zsmalloc lock
> granularity.
I definitely agree that the correct approach should be to fix the lock
contention at the source and drop zswap's usage of multiple zpools.
Nonetheless, I think this patch provides value in the meantime. The
fragmentation within the slab caches is real with zswap's use case.
OTOH, sharing a cache between swap and file system use cases leading
to fragmentation within the same slab cache is a less severe problem
in my opinion.

That being said, I don't feel strongly. If you really don't like this
patch I am fine with dropping it.
On Thu, Jun 6, 2024 at 4:03 PM Yosry Ahmed <[email protected]> wrote:
>
> On Thu, Jun 6, 2024 at 3:36 PM Minchan Kim <[email protected]> wrote:
> > [...]
> > The correct approach would be to further reduce the zsmalloc lock
> > granularity.
>
> I definitely agree that the correct approach should be to fix the lock
> contention at the source and drop zswap's usage of multiple zpools.
> Nonetheless, I think this patch provides value in the meantime. The
> fragmentation within the slab caches is real with zswap's use case.
> OTOH, sharing a cache between swap and file system use cases leading
> to fragmentation within the same slab cache is a less severe problem
> in my opinion.
>
> That being said, I don't feel strongly. If you really don't like this
> patch I am fine with dropping it.
Oh and I forgot to mention, Chengming said he is already working on
restoring the per-class lock and collecting lock contention data, so
maybe that will be enough after all. Ideally we want to compare:
- single zpool with per-pool lock
- multiple zpools with per-pool lock (current)
- single zpool with per-class locks
On Thu, Jun 06, 2024 at 04:03:55PM -0700, Yosry Ahmed wrote:
> On Thu, Jun 6, 2024 at 3:36 PM Minchan Kim <[email protected]> wrote:
> >
> > On Tue, Jun 04, 2024 at 05:53:40PM +0000, Yosry Ahmed wrote:
> > > [...]
> >
> > I doubt this is the right direction.
> >
> > Zsmalloc is used for various purposes, each with different object
> > lifecycles. For example, swap operations involve relatively short-lived
> > objects, while filesystem use cases might have longer-lived objects.
> > This mix of lifecycles could lead to fragmentation with this approach.
>
> Even in a swapfile, some objects can be short-lived and some objects
> can be long-lived, and the line between swap and file systems
> becomes blurry with shmem/tmpfs. I don't think having separate caches
Many allocators differentiate object lifecycles to minimize
fragmentation. While this isn't a new concept, you argue it's irrelevant
without a clearcut use case.
> here is vital, but I am not generally familiar with the file system
> use cases and I don't have data to prove/disprove it.
The use case I had in mind was build output directories (e.g., Android).
These consume object files in zram until the next build.

Other potential scenarios involve separate zrams: one for foreground
apps (short-term) and another for cached apps (long-term). Even
zswap and zram could have different object lifecycles, as zswap might
write back more aggressively.

While you see no clear use cases, I disagree with dismissing this
concept without strong justification.
>
> [...]
> >
> > The correct approach would be to further reduce the zsmalloc lock
> > granularity.
>
> I definitely agree that the correct approach should be to fix the lock
> contention at the source and drop zswap's usage of multiple zpools.
> Nonetheless, I think this patch provides value in the meantime. The
> fragmentation within the slab caches is real with zswap's use case.
> OTOH, sharing a cache between swap and file system use cases leading
> to fragmentation within the same slab cache is a less severe problem
> in my opinion.
>
> That being said, I don't feel strongly. If you really don't like this
> patch I am fine with dropping it.
How about introducing a flag like "bool slab_merge" in zs_create_pool?
This would allow zswap to unify slabs while others don't.
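
(Purely as an illustration of the shape such an opt-in could take -- a
user-space toy with hypothetical names, not kernel code, and the real
zs_create_pool() takes only a name today -- each pool either reuses one
shared cache or keeps its own:)

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a kmem_cache: only tracks a name and an alloc count. */
struct toy_cache {
	const char *name;
	long allocs;
};

/* One shared cache, analogous to a single global 'zs_handle' cache. */
static struct toy_cache shared_cache = { "zs_handle(shared)", 0 };

struct toy_pool {
	struct toy_cache *handle_cache;	/* points at shared or private */
	struct toy_cache private_cache;
};

/*
 * Hypothetical opt-in: slab_merge=true shares the global cache,
 * false keeps a per-pool cache (today's behavior).
 */
static struct toy_pool *toy_create_pool(const char *name, bool slab_merge)
{
	struct toy_pool *pool = calloc(1, sizeof(*pool));

	if (!pool)
		return NULL;
	if (slab_merge) {
		pool->handle_cache = &shared_cache;
	} else {
		pool->private_cache.name = name;
		pool->handle_cache = &pool->private_cache;
	}
	return pool;
}

int main(void)
{
	struct toy_pool *zswap_pool = toy_create_pool("zswap_handle", true);
	struct toy_pool *zram_pool = toy_create_pool("zram0_handle", false);

	if (!zswap_pool || !zram_pool)
		return 1;
	zswap_pool->handle_cache->allocs++;
	zram_pool->handle_cache->allocs++;
	printf("%s: %ld allocs, %s: %ld allocs\n",
	       zswap_pool->handle_cache->name, zswap_pool->handle_cache->allocs,
	       zram_pool->handle_cache->name, zram_pool->handle_cache->allocs);
	free(zswap_pool);
	free(zram_pool);
	return 0;
}
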
On Fri, Jun 7, 2024 at 9:14 AM Minchan Kim <[email protected]> wrote:
>
> On Thu, Jun 06, 2024 at 04:03:55PM -0700, Yosry Ahmed wrote:
> > On Thu, Jun 6, 2024 at 3:36 PM Minchan Kim <[email protected]> wrote:
> > >
> > > On Tue, Jun 04, 2024 at 05:53:40PM +0000, Yosry Ahmed wrote:
> > > > [...]
> > >
> > > I doubt this is the right direction.
> > >
> > > Zsmalloc is used for various purposes, each with different object
> > > lifecycles. For example, swap operations involve relatively short-lived
> > > objects, while filesystem use cases might have longer-lived objects.
> > > This mix of lifecycles could lead to fragmentation with this approach.
> >
> > Even in a swapfile, some objects can be short-lived and some objects
> > can be long-lived, and the line between swap and file systems
> > becomes blurry with shmem/tmpfs. I don't think having separate caches
>
>
> Many allocators differentiate object lifecycles to minimize
> fragmentation. While this isn't a new concept, you argue it's irrelevant
> without a clearcut use case.
>
> > here is vital, but I am not generally familiar with the file system
> > use cases and I don't have data to prove/disprove it.
>
> The use case I had in mind was build output directories (e.g., Android).
> These consume object files in zram until the next build.
>
> Other potential scenarios involve separate zrams: one for foreground
> apps (short-term) and another for cached apps (long-term). Even
> zswap and zram could have different object lifecycles, as zswap might
> write back more aggressively.
>
> While you see no clear use cases, I disagree with dismissing this
> concept without strong justification.
I was just unaware of these use cases, as I mentioned. I didn't really
know how zram was used with file systems. Thanks for the examples :)
>
> [...]
> How about introducing a flag like "bool slab_merge" in zs_create_pool?
> This would allow zswap to unify slabs while others don't.
Yeah this should work. But I'll wait until we have more data and we
know whether we need to keep using multiple zpools for zswap.

I sent this patch because I thought it would be generally useful to
share caches (e.g. if we have zram and zswap on the same system), but
given that you said it is actually preferable that the caches are
separate, it may not be. I'll wait for more data before sending any
more patches to address this.

Andrew, could you please take this out of mm-unstable?

Thanks!