This commit adds the writeback mechanism for zsmalloc, analogous to the
zbud allocator. Zsmalloc will attempt to determine the coldest zspage
(i.e. least recently used) in the pool, and attempt to write back all the
stored compressed objects via the pool's evict handler.
Signed-off-by: Nhat Pham <[email protected]>
---
mm/zsmalloc.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 177 insertions(+), 15 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 76ff2ed839d0..c79cbd3f46f3 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -279,10 +279,13 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
+ bool under_reclaim;
+
+ /* list of unfreed handles whose objects have been reclaimed */
+ unsigned long *deferred_handles;
+
struct zs_pool *pool;
-#ifdef CONFIG_COMPACTION
rwlock_t lock;
-#endif
};
struct mapping_area {
@@ -303,10 +306,11 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-#ifdef CONFIG_COMPACTION
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage);
static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
@@ -314,9 +318,6 @@ static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static void migrate_lock_init(struct zspage *zspage) {}
-static void migrate_read_lock(struct zspage *zspage) {}
-static void migrate_read_unlock(struct zspage *zspage) {}
static void migrate_write_lock(struct zspage *zspage) {}
static void migrate_write_lock_nested(struct zspage *zspage) {}
static void migrate_write_unlock(struct zspage *zspage) {}
@@ -444,6 +445,27 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
+
+static int zs_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zs_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
+ .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -955,6 +978,21 @@ static int trylock_zspage(struct zspage *zspage)
return 0;
}
+/*
+ * Free all the deferred handles whose objects are freed in zs_free.
+ */
+static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+{
+ unsigned long handle = (unsigned long) zspage->deferred_handles;
+
+ while (handle) {
+ unsigned long nxt_handle = handle_to_obj(handle);
+
+ cache_free_handle(pool, handle);
+ handle = nxt_handle;
+ }
+}
+
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -969,6 +1007,9 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_EMPTY);
+ /* Free all deferred handles from zs_free */
+ free_handles(pool, zspage);
+
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1051,6 +1092,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
}
INIT_LIST_HEAD(&zspage->lru);
+ zspage->under_reclaim = false;
+ zspage->deferred_handles = NULL;
set_freeobj(zspage, 0);
}
@@ -1472,11 +1515,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, OBJ_USED, 1);
- /* Move the zspage to front of pool's LRU */
- move_to_front(pool, zspage);
- spin_unlock(&pool->lock);
- return handle;
+ goto out;
}
spin_unlock(&pool->lock);
@@ -1500,6 +1540,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
+
+out:
/* Move the zspage to front of pool's LRU */
move_to_front(pool, zspage);
spin_unlock(&pool->lock);
@@ -1557,12 +1599,24 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
+
+ if (zspage->under_reclaim) {
+ /*
+ * Reclaim needs the handles during writeback. It'll free
+ * them along with the zspage when it's done with them.
+ *
+ * Record current deferred handle at the memory location
+ * whose address is given by handle.
+ */
+ record_obj(handle, (unsigned long) zspage->deferred_handles);
+ zspage->deferred_handles = (unsigned long *) handle;
+ spin_unlock(&pool->lock);
+ return;
+ }
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY)
- goto out;
+ if (fullness == ZS_EMPTY)
+ free_zspage(pool, class, zspage);
- free_zspage(pool, class, zspage);
-out:
spin_unlock(&pool->lock);
cache_free_handle(pool, handle);
}
@@ -1762,7 +1816,6 @@ static enum fullness_group putback_zspage(struct size_class *class,
return fullness;
}
-#ifdef CONFIG_COMPACTION
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1805,6 +1858,21 @@ static void lock_zspage(struct zspage *zspage)
migrate_read_unlock(zspage);
}
+/*
+ * Unlocks all the pages of the zspage.
+ *
+ * pool->lock must be held before this function is called
+ * to prevent the underlying pages from migrating.
+ */
+static void unlock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+
static void migrate_lock_init(struct zspage *zspage)
{
rwlock_init(&zspage->lock);
@@ -1820,6 +1888,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
@@ -2380,6 +2449,99 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
+{
+ int i, obj_idx, ret = 0;
+ unsigned long handle;
+ struct zspage *zspage;
+ struct page *page;
+ enum fullness_group fullness;
+
+ /* Lock LRU and fullness list */
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+ retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < retries; i++) {
+ struct size_class *class;
+
+ zspage = list_last_entry(&pool->lru, struct zspage, lru);
+ list_del(&zspage->lru);
+
+ /* zs_free may free objects, but not the zspage and handles */
+ zspage->under_reclaim = true;
+
+ /* Lock backing pages into place */
+ lock_zspage(zspage);
+
+ class = zspage_class(pool, zspage);
+ fullness = get_fullness_group(class, zspage);
+
+ /* Lock out object allocations and object compaction */
+ remove_zspage(class, zspage, fullness);
+
+ spin_unlock(&pool->lock);
+
+ obj_idx = 0;
+ page = zspage->first_page;
+ while (1) {
+ handle = find_alloced_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
+
+ /*
+ * This will write the object and call
+ * zs_free.
+ *
+ * zs_free will free the object, but the
+ * under_reclaim flag prevents it from freeing
+ * the zspage altogether. This is necessary so
+ * that we can continue working with the
+ * zspage potentially after the last object
+ * has been freed.
+ */
+ ret = pool->ops->evict(pool, handle);
+ if (ret)
+ goto next;
+
+ obj_idx++;
+ }
+
+next:
+ /* For freeing the zspage, or putting it back in the pool and LRU list. */
+ spin_lock(&pool->lock);
+ zspage->under_reclaim = false;
+
+ if (!get_zspage_inuse(zspage)) {
+ /*
+ * Fullness went stale as zs_free() won't touch it
+ * while the page is removed from the pool. Fix it
+ * up for the check in __free_zspage().
+ */
+ zspage->fullness = ZS_EMPTY;
+
+ __free_zspage(pool, class, zspage);
+ spin_unlock(&pool->lock);
+ return 0;
+ }
+
+ putback_zspage(class, zspage);
+ list_add(&zspage->lru, &pool->lru);
+ unlock_zspage(zspage);
+ }
+
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+
static int __init zs_init(void)
{
int ret;
--
2.30.2
Hi Nhat,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.1-rc2 next-20221027]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Nhat-Pham/Implement-writeback-for-zsmalloc/20221027-040711
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20221026200613.1031261-6-nphamcs%40gmail.com
patch subject: [PATCH 5/5] zsmalloc: Implement writeback mechanism for zsmalloc
config: csky-randconfig-r035-20221026
compiler: csky-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/2a23b8adb2a4ca453bb39a7991c93a803899c102
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Nhat-Pham/Implement-writeback-for-zsmalloc/20221027-040711
git checkout 2a23b8adb2a4ca453bb39a7991c93a803899c102
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=csky SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
mm/zsmalloc.c: In function 'zs_reclaim_page':
>> mm/zsmalloc.c:2462:18: error: 'struct zs_pool' has no member named 'ops'
2462 | if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
| ^~
mm/zsmalloc.c:2462:32: error: 'struct zs_pool' has no member named 'ops'
2462 | if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
| ^~
mm/zsmalloc.c:2511:35: error: 'struct zs_pool' has no member named 'ops'
2511 | ret = pool->ops->evict(pool, handle);
| ^~
mm/zsmalloc.c: At top level:
mm/zsmalloc.c:2452:12: warning: 'zs_reclaim_page' defined but not used [-Wunused-function]
2452 | static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
| ^~~~~~~~~~~~~~~
vim +2462 mm/zsmalloc.c
2451
2452 static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
2453 {
2454 int i, obj_idx, ret = 0;
2455 unsigned long handle;
2456 struct zspage *zspage;
2457 struct page *page;
2458 enum fullness_group fullness;
2459
2460 /* Lock LRU and fullness list */
2461 spin_lock(&pool->lock);
> 2462 if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
2463 retries == 0) {
2464 spin_unlock(&pool->lock);
2465 return -EINVAL;
2466 }
2467
2468 for (i = 0; i < retries; i++) {
2469 struct size_class *class;
2470
2471 zspage = list_last_entry(&pool->lru, struct zspage, lru);
2472 list_del(&zspage->lru);
2473
2474 /* zs_free may free objects, but not the zspage and handles */
2475 zspage->under_reclaim = true;
2476
2477 /* Lock backing pages into place */
2478 lock_zspage(zspage);
2479
2480 class = zspage_class(pool, zspage);
2481 fullness = get_fullness_group(class, zspage);
2482
2483 /* Lock out object allocations and object compaction */
2484 remove_zspage(class, zspage, fullness);
2485
2486 spin_unlock(&pool->lock);
2487
2488 obj_idx = 0;
2489 page = zspage->first_page;
2490 while (1) {
2491 handle = find_alloced_obj(class, page, &obj_idx);
2492 if (!handle) {
2493 page = get_next_page(page);
2494 if (!page)
2495 break;
2496 obj_idx = 0;
2497 continue;
2498 }
2499
2500 /*
2501 * This will write the object and call
2502 * zs_free.
2503 *
2504 * zs_free will free the object, but the
2505 * under_reclaim flag prevents it from freeing
2506 * the zspage altogether. This is necessary so
2507 * that we can continue working with the
2508 * zspage potentially after the last object
2509 * has been freed.
2510 */
2511 ret = pool->ops->evict(pool, handle);
2512 if (ret)
2513 goto next;
2514
2515 obj_idx++;
2516 }
2517
2518 next:
2519 /* For freeing the zspage, or putting it back in the pool and LRU list. */
2520 spin_lock(&pool->lock);
2521 zspage->under_reclaim = false;
2522
2523 if (!get_zspage_inuse(zspage)) {
2524 /*
2525 * Fullness went stale as zs_free() won't touch it
2526 * while the page is removed from the pool. Fix it
2527 * up for the check in __free_zspage().
2528 */
2529 zspage->fullness = ZS_EMPTY;
2530
2531 __free_zspage(pool, class, zspage);
2532 spin_unlock(&pool->lock);
2533 return 0;
2534 }
2535
2536 putback_zspage(class, zspage);
2537 list_add(&zspage->lru, &pool->lru);
2538 unlock_zspage(zspage);
2539 }
2540
2541 spin_unlock(&pool->lock);
2542 return -EAGAIN;
2543 }
2544
--
0-DAY CI Kernel Test Service
https://01.org/lkp
This commit adds the writeback mechanism for zsmalloc, analogous to the
zbud allocator. Zsmalloc will attempt to determine the coldest zspage
(i.e. least recently used) in the pool, and attempt to write back all the
stored compressed objects via the pool's evict handler.
Signed-off-by: Nhat Pham <[email protected]>
---
mm/zsmalloc.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 183 insertions(+), 15 deletions(-)
Fix the issue reported by kernel test robot <[email protected]>
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 76ff2ed839d0..f7fa69360dd9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -279,10 +279,13 @@ struct zspage {
/* links the zspage to the lru list in the pool */
struct list_head lru;
+ bool under_reclaim;
+
+ /* list of unfreed handles whose objects have been reclaimed */
+ unsigned long *deferred_handles;
+
struct zs_pool *pool;
-#ifdef CONFIG_COMPACTION
rwlock_t lock;
-#endif
};
struct mapping_area {
@@ -303,10 +306,11 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-#ifdef CONFIG_COMPACTION
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage);
static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
@@ -314,9 +318,6 @@ static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static void migrate_lock_init(struct zspage *zspage) {}
-static void migrate_read_lock(struct zspage *zspage) {}
-static void migrate_read_unlock(struct zspage *zspage) {}
static void migrate_write_lock(struct zspage *zspage) {}
static void migrate_write_lock_nested(struct zspage *zspage) {}
static void migrate_write_unlock(struct zspage *zspage) {}
@@ -444,6 +445,27 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
+
+static int zs_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zs_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
+ .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -955,6 +978,21 @@ static int trylock_zspage(struct zspage *zspage)
return 0;
}
+/*
+ * Free all the deferred handles whose objects are freed in zs_free.
+ */
+static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+{
+ unsigned long handle = (unsigned long) zspage->deferred_handles;
+
+ while (handle) {
+ unsigned long nxt_handle = handle_to_obj(handle);
+
+ cache_free_handle(pool, handle);
+ handle = nxt_handle;
+ }
+}
+
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -969,6 +1007,9 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_EMPTY);
+ /* Free all deferred handles from zs_free */
+ free_handles(pool, zspage);
+
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1051,6 +1092,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
}
INIT_LIST_HEAD(&zspage->lru);
+ zspage->under_reclaim = false;
+ zspage->deferred_handles = NULL;
set_freeobj(zspage, 0);
}
@@ -1472,11 +1515,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, OBJ_USED, 1);
- /* Move the zspage to front of pool's LRU */
- move_to_front(pool, zspage);
- spin_unlock(&pool->lock);
- return handle;
+ goto out;
}
spin_unlock(&pool->lock);
@@ -1500,6 +1540,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
+
+out:
/* Move the zspage to front of pool's LRU */
move_to_front(pool, zspage);
spin_unlock(&pool->lock);
@@ -1557,12 +1599,24 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
+
+ if (zspage->under_reclaim) {
+ /*
+ * Reclaim needs the handles during writeback. It'll free
+ * them along with the zspage when it's done with them.
+ *
+ * Record current deferred handle at the memory location
+ * whose address is given by handle.
+ */
+ record_obj(handle, (unsigned long) zspage->deferred_handles);
+ zspage->deferred_handles = (unsigned long *) handle;
+ spin_unlock(&pool->lock);
+ return;
+ }
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY)
- goto out;
+ if (fullness == ZS_EMPTY)
+ free_zspage(pool, class, zspage);
- free_zspage(pool, class, zspage);
-out:
spin_unlock(&pool->lock);
cache_free_handle(pool, handle);
}
@@ -1762,7 +1816,7 @@ static enum fullness_group putback_zspage(struct size_class *class,
return fullness;
}
-#ifdef CONFIG_COMPACTION
+#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION)
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1804,6 +1858,24 @@ static void lock_zspage(struct zspage *zspage)
}
migrate_read_unlock(zspage);
}
+#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Unlocks all the pages of the zspage.
+ *
+ * pool->lock must be held before this function is called
+ * to prevent the underlying pages from migrating.
+ */
+static void unlock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+#endif /* CONFIG_ZPOOL */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -1820,6 +1892,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
@@ -2380,6 +2453,101 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+#ifdef CONFIG_ZPOOL
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
+{
+ int i, obj_idx, ret = 0;
+ unsigned long handle;
+ struct zspage *zspage;
+ struct page *page;
+ enum fullness_group fullness;
+
+ /* Lock LRU and fullness list */
+ spin_lock(&pool->lock);
+ if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+ retries == 0) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < retries; i++) {
+ struct size_class *class;
+
+ zspage = list_last_entry(&pool->lru, struct zspage, lru);
+ list_del(&zspage->lru);
+
+ /* zs_free may free objects, but not the zspage and handles */
+ zspage->under_reclaim = true;
+
+ /* Lock backing pages into place */
+ lock_zspage(zspage);
+
+ class = zspage_class(pool, zspage);
+ fullness = get_fullness_group(class, zspage);
+
+ /* Lock out object allocations and object compaction */
+ remove_zspage(class, zspage, fullness);
+
+ spin_unlock(&pool->lock);
+
+ obj_idx = 0;
+ page = zspage->first_page;
+ while (1) {
+ handle = find_alloced_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
+
+ /*
+ * This will write the object and call
+ * zs_free.
+ *
+ * zs_free will free the object, but the
+ * under_reclaim flag prevents it from freeing
+ * the zspage altogether. This is necessary so
+ * that we can continue working with the
+ * zspage potentially after the last object
+ * has been freed.
+ */
+ ret = pool->ops->evict(pool, handle);
+ if (ret)
+ goto next;
+
+ obj_idx++;
+ }
+
+next:
+ /* For freeing the zspage, or putting it back in the pool and LRU list. */
+ spin_lock(&pool->lock);
+ zspage->under_reclaim = false;
+
+ if (!get_zspage_inuse(zspage)) {
+ /*
+ * Fullness went stale as zs_free() won't touch it
+ * while the page is removed from the pool. Fix it
+ * up for the check in __free_zspage().
+ */
+ zspage->fullness = ZS_EMPTY;
+
+ __free_zspage(pool, class, zspage);
+ spin_unlock(&pool->lock);
+ return 0;
+ }
+
+ putback_zspage(class, zspage);
+ list_add(&zspage->lru, &pool->lru);
+ unlock_zspage(zspage);
+ }
+
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+#endif /* CONFIG_ZPOOL */
+
static int __init zs_init(void)
{
int ret;
--
2.30.2
On Thu, Oct 27, 2022 at 11:27:36AM -0700, Nhat Pham wrote:
> This commit adds the writeback mechanism for zsmalloc, analogous to the
> zbud allocator. Zsmalloc will attempt to determine the coldest zspage
> (i.e least recently used) in the pool, and attempt to write back all the
> stored compressed objects via the pool's evict handler.
>
> Signed-off-by: Nhat Pham <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
On (22/10/27 11:27), Nhat Pham wrote:
[..]
> @@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
> .malloc_support_movable = true,
> .malloc = zs_zpool_malloc,
> .free = zs_zpool_free,
> + .shrink = zs_zpool_shrink,
This should use tabs, just like the rest of fields.
> .map = zs_zpool_map,
> .unmap = zs_zpool_unmap,
> .total_size = zs_zpool_total_size,
> @@ -955,6 +978,21 @@ static int trylock_zspage(struct zspage *zspage)
> return 0;
> }
>
> +/*
> + * Free all the deferred handles whose objects are freed in zs_free.
> + */
> +static void free_handles(struct zs_pool *pool, struct zspage *zspage)
> +{
> + unsigned long handle = (unsigned long) zspage->deferred_handles;
A nit: we don't put a white space in cases like this.
IOW (unsigned long)zspage->.... here and everywhere else in this series.
On (22/10/27 11:27), Nhat Pham wrote:
> +
> +static int zs_zpool_shrink(void *pool, unsigned int pages,
> + unsigned int *reclaimed)
> +{
> + unsigned int total = 0;
> + int ret = -EINVAL;
> +
> + while (total < pages) {
> + ret = zs_reclaim_page(pool, 8);
> + if (ret < 0)
> + break;
> + total++;
> + }
> +
> + if (reclaimed)
> + *reclaimed = total;
> +
> + return ret;
> +}
The name collides with shrinker callbacks (compaction). That's a bit
confusing, took me some time.
> @@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
> .malloc_support_movable = true,
> .malloc = zs_zpool_malloc,
> .free = zs_zpool_free,
> + .shrink = zs_zpool_shrink,
> .map = zs_zpool_map,
> .unmap = zs_zpool_unmap,
> .total_size = zs_zpool_total_size,
> @@ -955,6 +978,21 @@ static int trylock_zspage(struct zspage *zspage)
> return 0;
> }
[..]
> +#ifdef CONFIG_ZPOOL
> +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> +{
> + int i, obj_idx, ret = 0;
> + unsigned long handle;
> + struct zspage *zspage;
> + struct page *page;
> + enum fullness_group fullness;
> +
> + /* Lock LRU and fullness list */
> + spin_lock(&pool->lock);
> + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
You don't need pool->lock for pool->ops/pool->ops->evict checks.
But, more importantly, I don't understand why it is even checked here.
Why do we use ops->evict? Why can't we call into the zsmalloc evict
code directly? All of these are statically defined in zsmalloc — just
don't provide .shrink if CONFIG_ZPOOL is not defined. Under what
circumstances can zsmalloc provide .shrink but no ->evict?
On (22/10/27 11:27), Nhat Pham wrote:
[..]
> +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> +{
> + int i, obj_idx, ret = 0;
> + unsigned long handle;
> + struct zspage *zspage;
> + struct page *page;
> + enum fullness_group fullness;
> +
> + /* Lock LRU and fullness list */
> + spin_lock(&pool->lock);
> + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> + retries == 0) {
> + spin_unlock(&pool->lock);
> + return -EINVAL;
> + }
> +
> + for (i = 0; i < retries; i++) {
> + struct size_class *class;
> +
> + zspage = list_last_entry(&pool->lru, struct zspage, lru);
> + list_del(&zspage->lru);
> +
> + /* zs_free may free objects, but not the zspage and handles */
> + zspage->under_reclaim = true;
> +
> + /* Lock backing pages into place */
> + lock_zspage(zspage);
Does this call into the scheduler under pool->lock spinlock?
On Wed, Nov 02, 2022 at 12:42:06PM +0900, Sergey Senozhatsky wrote:
> On (22/10/27 11:27), Nhat Pham wrote:
> [..]
> > +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
> > +{
> > + int i, obj_idx, ret = 0;
> > + unsigned long handle;
> > + struct zspage *zspage;
> > + struct page *page;
> > + enum fullness_group fullness;
> > +
> > + /* Lock LRU and fullness list */
> > + spin_lock(&pool->lock);
> > + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> > + retries == 0) {
> > + spin_unlock(&pool->lock);
> > + return -EINVAL;
> > + }
> > +
> > + for (i = 0; i < retries; i++) {
> > + struct size_class *class;
> > +
> > + zspage = list_last_entry(&pool->lru, struct zspage, lru);
> > + list_del(&zspage->lru);
> > +
> > + /* zs_free may free objects, but not the zspage and handles */
> > + zspage->under_reclaim = true;
> > +
> > + /* Lock backing pages into place */
> > + lock_zspage(zspage);
>
> Does this call into the scheduler under pool->lock spinlock?
Good catch!
We went back and checked our logs, and this didn't actually hit in our
production. We also couldn't trigger it with explicit compaction.
It's an easy fix, the page locks can be acquired after dropping the
pool lock.
On Wed, Nov 02, 2022 at 01:13:58PM +0900, Sergey Senozhatsky wrote:
> On (22/10/27 11:27), Nhat Pham wrote:
> > +
> > +static int zs_zpool_shrink(void *pool, unsigned int pages,
> > + unsigned int *reclaimed)
> > +{
> > + unsigned int total = 0;
> > + int ret = -EINVAL;
> > +
> > + while (total < pages) {
> > + ret = zs_reclaim_page(pool, 8);
> > + if (ret < 0)
> > + break;
> > + total++;
> > + }
> > +
> > + if (reclaimed)
> > + *reclaimed = total;
> > +
> > + return ret;
> > +}
>
> The name collides with shrinker callbacks (compaction). That's a bit
> confusing, took me some time.
Yeah this tripped us up too.
Unfortunately, this is inherited from the zpool API:
> > @@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
> > .malloc_support_movable = true,
> > .malloc = zs_zpool_malloc,
> > .free = zs_zpool_free,
> > + .shrink = zs_zpool_shrink,
There is another terminology collision around "compaction" and
"migration": There is zs_page_migrate() which is called from physical
page migration and compaction to switch out backing struct pages of
the zspage. Then there is migrate_zspage() which is called from a
*shrinker* through *zs_compact()*, and it consolidates the objects
from partially used zspages into full ones.
We're collecting these issues and want to clean them up separately. It
seems this codebase hasn't had a lot of TLC recently, which makes it
harder to make needed changes :/ We try to clean up as we go, but some
of these issues (like the .shrink callback) run through all backends,
zpool and zswap, so they'd be better off in separate cleanup patches.
The out-of-control locking complexity in zsmalloc is another one of
these issues that we hoped we could clean up along the way, as you
have noticed...
On (22/11/03 12:45), Johannes Weiner wrote:
> On Wed, Nov 02, 2022 at 01:13:58PM +0900, Sergey Senozhatsky wrote:
> > On (22/10/27 11:27), Nhat Pham wrote:
> > > +
> > > +static int zs_zpool_shrink(void *pool, unsigned int pages,
> > > + unsigned int *reclaimed)
> > > +{
> > > + unsigned int total = 0;
> > > + int ret = -EINVAL;
> > > +
> > > + while (total < pages) {
> > > + ret = zs_reclaim_page(pool, 8);
> > > + if (ret < 0)
> > > + break;
> > > + total++;
> > > + }
> > > +
> > > + if (reclaimed)
> > > + *reclaimed = total;
> > > +
> > > + return ret;
> > > +}
> >
> > The name collides with shrinker callbacks (compaction). That's a bit
> > confusing, took me some time.
>
> Yeah this tripped us up too.
>
> Unfortunately, this is inherited from the zpool API:
Yeah, sure. I'm not asking to rename the zpool API; that was just a personal
note that it took me some time to figure out that we now have two different
shrinkers.
> > > @@ -482,6 +504,7 @@ static struct zpool_driver zs_zpool_driver = {
> > > .malloc_support_movable = true,
> > > .malloc = zs_zpool_malloc,
> > > .free = zs_zpool_free,
> > > + .shrink = zs_zpool_shrink,
>
> There is another terminology collision around "compaction" and
> "migration": There is zs_page_migrate() which is called from physical
> page migration and compaction to switch out backing struct pages of
> the zspage. Then there is migrate_zspage() which is called from a
> *shrinker* through *zs_compact()*, and it consolidates the objects
> from partially used zspages into full ones.
Oh, yeah...
> We're collecting these issues and want to clean them up separately. It
> seems this codebase hasn't had a lot of TLC recently, which makes it
> harder to make needed changes :/ We try to clean up as we go, but some
> of these issues (like the .shrink callback) run through all backends,
> zpool and zswap, so they'd be better off in separate cleanup patches.
Totally agreed with the separate effort/patches.