2024-03-11 16:12:33

by Johannes Weiner

Subject: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

Profiling the munmap() of a zswapped memory region shows 50%(!) of the
total cycles currently going into updating the zswap_pool_total_size.

There are three consumers of this counter:
- store, to enforce the globally configured pool limit
- meminfo & debugfs, to report the size to the user
- shrink, to determine the batch size for each cycle

Instead of aggregating every time an entry enters or exits the zswap
pool, aggregate the value from the zpools on-demand:

- Stores aggregate the counter anyway upon success. Aggregating to
check the limit instead is the same amount of work.

- Meminfo & debugfs might benefit somewhat from a pre-aggregated
counter, but aren't exactly hotpaths.

- Shrinking can aggregate once for every cycle instead of doing it for
every freed entry. As the shrinker might work on tens or hundreds of
objects per scan cycle, this is a large reduction in aggregations.

The paths that benefit dramatically are swapin, swapoff, and
unmaps. There could be millions of pages being processed until
somebody asks for the pool size again. This eliminates the pool size
updates from those paths entirely.

Signed-off-by: Johannes Weiner <[email protected]>
---
fs/proc/meminfo.c | 3 +-
include/linux/zswap.h | 2 +-
mm/zswap.c | 98 +++++++++++++++++++++----------------------
3 files changed, 49 insertions(+), 54 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 45af9a989d40..245171d9164b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
#ifdef CONFIG_ZSWAP
- seq_printf(m, "Zswap: %8lu kB\n",
- (unsigned long)(zswap_pool_total_size >> 10));
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
seq_printf(m, "Zswapped: %8lu kB\n",
(unsigned long)atomic_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10));
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 341aea490070..2a85b941db97 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -7,7 +7,6 @@

struct lruvec;

-extern u64 zswap_pool_total_size;
extern atomic_t zswap_stored_pages;

#ifdef CONFIG_ZSWAP
@@ -27,6 +26,7 @@ struct zswap_lruvec_state {
atomic_long_t nr_zswap_protected;
};

+unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(swp_entry_t swp);
diff --git a/mm/zswap.c b/mm/zswap.c
index 9a3237752082..7c39327a7cc2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,8 +43,6 @@
/*********************************
* statistics
**********************************/
-/* Total bytes used by the compressed storage */
-u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
@@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))

-static bool zswap_is_full(void)
-{
- return totalram_pages() * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static bool zswap_can_accept(void)
-{
- return totalram_pages() * zswap_accept_thr_percent / 100 *
- zswap_max_pool_percent / 100 >
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static u64 get_zswap_pool_size(struct zswap_pool *pool)
-{
- u64 pool_size = 0;
- int i;
-
- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- pool_size += zpool_get_total_size(pool->zpools[i]);
-
- return pool_size;
-}
-
-static void zswap_update_total_size(void)
-{
- struct zswap_pool *pool;
- u64 total = 0;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += get_zswap_pool_size(pool);
-
- rcu_read_unlock();
-
- zswap_pool_total_size = total;
-}
-
/*********************************
* pool functions
**********************************/
@@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}

+static unsigned long zswap_max_pages(void)
+{
+ return totalram_pages() * zswap_max_pool_percent / 100;
+}
+
+unsigned long zswap_total_pages(void)
+{
+ struct zswap_pool *pool;
+ u64 total = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ int i;
+
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ total += zpool_get_total_size(pool->zpools[i]);
+ }
+ rcu_read_unlock();
+
+ return total >> PAGE_SHIFT;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
}
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
}

/*
@@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else
/* use pool stats instead of memcg stats */
- nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
+ nr_backing = zswap_total_pages();
nr_stored = atomic_read(&zswap_nr_stored);
#endif

@@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
int ret, failures = 0;
+ unsigned long thr;
+
+ /* Reclaim down to the accept threshold */
+ thr = zswap_max_pages() * zswap_accept_thr_percent / 100;

/* global reclaim will select cgroup in a round-robin fashion. */
do {
@@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
break;
if (ret && ++failures == MAX_RECLAIM_RETRIES)
break;
-
resched:
cond_resched();
- } while (!zswap_can_accept());
+ } while (zswap_total_pages() > thr);
}

static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
@@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry, *dupentry;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
+ unsigned long max_pages, cur_pages;

VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
if (!zswap_enabled)
goto check_old;

+ /* Check cgroup limits */
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
@@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}

- /* reclaim space if needed */
- if (zswap_is_full()) {
+ /* Check global limits */
+ cur_pages = zswap_total_pages();
+ max_pages = zswap_max_pages();
+
+ if (cur_pages >= max_pages) {
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
goto shrink;
}

if (zswap_pool_reached_full) {
- if (!zswap_can_accept())
+ unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
+
+ if (cur_pages > thr)
goto shrink;
else
zswap_pool_reached_full = false;
@@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)

/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_update_total_size();
count_vm_event(ZSWPOUT);

return true;
@@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)

static struct dentry *zswap_debugfs_root;

+static int debugfs_get_total_size(void *data, u64 *val)
+{
+ *val = zswap_total_pages() * PAGE_SIZE;
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("pool_total_size", 0444,
- zswap_debugfs_root, &zswap_pool_total_size);
+ debugfs_create_file("pool_total_size", 0444,
+ zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages);
debugfs_create_atomic_t("same_filled_pages", 0444,
--
2.44.0



2024-03-11 16:12:40

by Johannes Weiner

Subject: [PATCH 2/2] mm: zpool: return pool size in pages

All zswap backends track their pool sizes in pages. Currently they
multiply by PAGE_SIZE for zswap, only for zswap to divide again in
order to do limit math. Report pages directly.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/z3fold.c | 2 +-
mm/zbud.c | 2 +-
mm/zpool.c | 4 ++--
mm/zsmalloc.c | 2 +-
mm/zswap.c | 4 ++--
5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7ab05621052d..9bacacd4168c 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1404,7 +1404,7 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)

static u64 z3fold_zpool_total_size(void *pool)
{
- return z3fold_get_pool_size(pool) * PAGE_SIZE;
+ return z3fold_get_pool_size(pool);
}

static struct zpool_driver z3fold_zpool_driver = {
diff --git a/mm/zbud.c b/mm/zbud.c
index 2190cc1f37b3..b7d8a22bbf5f 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -412,7 +412,7 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)

static u64 zbud_zpool_total_size(void *pool)
{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
+ return zbud_get_pool_size(pool);
}

static struct zpool_driver zbud_zpool_driver = {
diff --git a/mm/zpool.c b/mm/zpool.c
index 846410479c2f..410808aee7fe 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -324,9 +324,9 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
* zpool_get_total_size() - The total size of the pool
* @zpool: The zpool to check
*
- * This returns the total size in bytes of the pool.
+ * This returns the total size in pages of the pool.
*
- * Returns: Total size of the zpool in bytes.
+ * Returns: Total size of the zpool in pages.
*/
u64 zpool_get_total_size(struct zpool *zpool)
{
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7d7cb3eaabe0..398f3856817f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -401,7 +401,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)

static u64 zs_zpool_total_size(void *pool)
{
- return zs_get_total_pages(pool) << PAGE_SHIFT;
+ return zs_get_total_pages(pool);
}

static struct zpool_driver zs_zpool_driver = {
diff --git a/mm/zswap.c b/mm/zswap.c
index 7c39327a7cc2..fe4343e416e0 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -507,7 +507,7 @@ static unsigned long zswap_max_pages(void)
unsigned long zswap_total_pages(void)
{
struct zswap_pool *pool;
- u64 total = 0;
+ unsigned long total = 0;

rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list) {
@@ -518,7 +518,7 @@ unsigned long zswap_total_pages(void)
}
rcu_read_unlock();

- return total >> PAGE_SHIFT;
+ return total;
}

/*********************************
--
2.44.0


2024-03-11 22:09:47

by Yosry Ahmed

Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.

Yikes. I have always hated that size update scheme FWIW.

I have also wondered whether it makes sense to just maintain the number
of pages in zswap as an atomic, like zswap_stored_pages. I guess your
proposed scheme is even cheaper for the load/invalidate paths because we
do nothing at all. It could be an option if the aggregation in other
paths ever becomes a problem, but we would need to make sure it
doesn't regress the load/invalidate paths. Just sharing some thoughts.
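
For illustration, a minimal sketch of that alternative (the counter name and
helpers below are hypothetical, not code from either patch):

#include <linux/atomic.h>

/* Hypothetical global counter of zswap backing pages, maintained
 * eagerly the way zswap_stored_pages is today. */
static atomic_long_t zswap_pool_pages = ATOMIC_LONG_INIT(0);

/* store path: account pages newly allocated for compressed data */
static void zswap_pool_pages_add(long nr)
{
	atomic_long_add(nr, &zswap_pool_pages);
}

/* load/invalidate/swapoff paths: this per-entry update is exactly
 * the work the on-demand aggregation avoids */
static void zswap_pool_pages_sub(long nr)
{
	atomic_long_sub(nr, &zswap_pool_pages);
}

static unsigned long zswap_pool_pages_read(void)
{
	return atomic_long_read(&zswap_pool_pages);
}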

>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.

This looks like a big win, thanks! I wonder if you have any numbers or
perf profiles to share. That would be nice to have, but I think the
benefit is clear regardless.

I also like the implicit cleanup when we switch to maintaining the
number of pages rather than bytes. The code looks much better with all
the shifts and divisions gone :)

I have a couple of comments below. With them addressed, feel free to
add:
Acked-by: Yosry Ahmed <[email protected]>

[..]
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;

This calculation is repeated twice, so I'd rather keep a helper for it
as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
or zswap_acceptance_pages()?

>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
[..]
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");

I think we are missing a newline here to maintain the current format
(i.e. "%llu\n").

> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
> --
> 2.44.0
>

2024-03-11 22:12:14

by Yosry Ahmed

Subject: Re: [PATCH 2/2] mm: zpool: return pool size in pages

On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.

Nice. Although I would prefer renaming the zpool interface to
total_pages and renaming the zpool backend functions as well to use
pages rather than size.

Either way:
Acked-by: Yosry Ahmed <[email protected]>

2024-03-12 02:34:34

by Johannes Weiner

Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

On Mon, Mar 11, 2024 at 10:09:35PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> > Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> > total cycles currently going into updating the zswap_pool_total_size.
>
> Yikes. I have always hated that size update scheme FWIW.
>
> I have also wondered whether it makes sense to just maintain the number
> of pages in zswap as an atomic, like zswap_stored_pages. I guess your
> proposed scheme is even cheaper for the load/invalidate paths because we
> do nothing at all. It could be an option if the aggregation in other
> paths ever becomes a problem, but we would need to make sure it
> doesn't regress the load/invalidate paths. Just sharing some thoughts.

Agree with you there. I actually tried doing it that way at first, but
noticed zram uses zs_get_total_pages() and actually wants a per-pool
count. I didn't want the backend to have to update two atomics, so I
settled for this version.
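
For context, a rough sketch of the double bookkeeping that approach would
have required of a backend (zs_account_new_page() and the zswap-global
counter are hypothetical; pages_allocated is the existing per-pool count
behind zs_get_total_pages()):

/* Hypothetical: every backing page zsmalloc allocates would then have
 * to bump both its per-pool count (which zram needs) and a
 * zswap-global atomic, instead of just the former. */
extern atomic_long_t zswap_pool_pages;	/* hypothetical global */

static void zs_account_new_page(struct zs_pool *pool)
{
	atomic_long_inc(&pool->pages_allocated);	/* per pool, for zram */
	atomic_long_inc(&zswap_pool_pages);		/* global, for zswap */
}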

> > There are three consumers of this counter:
> > - store, to enforce the globally configured pool limit
> > - meminfo & debugfs, to report the size to the user
> > - shrink, to determine the batch size for each cycle
> >
> > Instead of aggregating everytime an entry enters or exits the zswap
> > pool, aggregate the value from the zpools on-demand:
> >
> > - Stores aggregate the counter anyway upon success. Aggregating to
> > check the limit instead is the same amount of work.
> >
> > - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> > counter, but aren't exactly hotpaths.
> >
> > - Shrinking can aggregate once for every cycle instead of doing it for
> > every freed entry. As the shrinker might work on tens or hundreds of
> > objects per scan cycle, this is a large reduction in aggregations.
> >
> > The paths that benefit dramatically are swapin, swapoff, and
> > unmaps. There could be millions of pages being processed until
> > somebody asks for the pool size again. This eliminates the pool size
> > updates from those paths entirely.
>
> This looks like a big win, thanks! I wonder if you have any numbers of
> perf profiles to share. That would be nice to have, but I think the
> benefit is clear regardless.

I deleted the perf files already, but can re-run it tomorrow.

> I also like the implicit cleanup when we switch to maintaining the
> number of pages rather than bytes. The code looks much better with all
> the shifts and divisions gone :)
>
> I have a couple of comments below. With them addressed, feel free to
> add:
> Acked-by: Yosry Ahmed <[email protected]>

Thanks!

> > @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> > {
> > struct mem_cgroup *memcg;
> > int ret, failures = 0;
> > + unsigned long thr;
> > +
> > + /* Reclaim down to the accept threshold */
> > + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> This calculation is repeated twice, so I'd rather keep a helper for it
> as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
> or zswap_acceptance_pages()?

Sounds good. I went with zswap_accept_thr_pages().

> > @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
> >
> > static struct dentry *zswap_debugfs_root;
> >
> > +static int debugfs_get_total_size(void *data, u64 *val)
> > +{
> > + *val = zswap_total_pages() * PAGE_SIZE;
> > + return 0;
> > +}
> > +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
>
> I think we are missing a newline here to maintain the current format
> (i.e "%llu\n").

Oops, good catch! I had verified the debugfs file (along with the
others) with 'grep . *', which hides that this is missing. Fixed up.

Thanks for taking a look. The incremental diff is below. I'll run the
tests and recapture the numbers tomorrow, then send v2.

diff --git a/mm/zswap.c b/mm/zswap.c
index 7c39327a7cc2..1a5cc7298306 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -504,6 +504,11 @@ static unsigned long zswap_max_pages(void)
return totalram_pages() * zswap_max_pool_percent / 100;
}

+static unsigned long zswap_accept_thr_pages(void)
+{
+ return zswap_max_pages() * zswap_accept_thr_percent / 100;
+}
+
unsigned long zswap_total_pages(void)
{
struct zswap_pool *pool;
@@ -1368,7 +1373,7 @@ static void shrink_worker(struct work_struct *w)
unsigned long thr;

/* Reclaim down to the accept threshold */
- thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
+ thr = zswap_accept_thr_pages();

/* global reclaim will select cgroup in a round-robin fashion. */
do {
@@ -1493,9 +1498,7 @@ bool zswap_store(struct folio *folio)
}

if (zswap_pool_reached_full) {
- unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
-
- if (cur_pages > thr)
+ if (cur_pages > zswap_accept_thr_pages())
goto shrink;
else
zswap_pool_reached_full = false;
@@ -1705,7 +1708,7 @@ static int debugfs_get_total_size(void *data, u64 *val)
*val = zswap_total_pages() * PAGE_SIZE;
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");

static int zswap_debugfs_init(void)
{

2024-03-12 02:36:55

by Johannes Weiner

Subject: Re: [PATCH 2/2] mm: zpool: return pool size in pages

On Mon, Mar 11, 2024 at 10:12:02PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> > All zswap backends track their pool sizes in pages. Currently they
> > multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> > order to do limit math. Report pages directly.
>
> Nice. Although I would prefer renaming the zpool interface to
> total_pages and renaming the zpool backends functions as well to use
> pages rather than size.

Ha, I was on the fence, since it's kind of churny. But if you don't
mind, then it works for me as well.

> Either way:
> Acked-by: Yosry Ahmed <[email protected]>

Thanks.

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 3296438eec06..a67d62b79698 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -53,7 +53,7 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle,

void zpool_unmap_handle(struct zpool *pool, unsigned long handle);

-u64 zpool_get_total_size(struct zpool *pool);
+u64 zpool_get_total_pages(struct zpool *pool);


/**
@@ -91,7 +91,7 @@ struct zpool_driver {
enum zpool_mapmode mm);
void (*unmap)(void *pool, unsigned long handle);

- u64 (*total_size)(void *pool);
+ u64 (*total_pages)(void *pool);
};

void zpool_register_driver(struct zpool_driver *driver);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 9bacacd4168c..2ebfed32871b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1237,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
}

/**
- * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * z3fold_get_pool_pages() - gets the z3fold pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool.
*/
-static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
{
return atomic64_read(&pool->pages_nr);
}
@@ -1402,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
z3fold_unmap(pool, handle);
}

-static u64 z3fold_zpool_total_size(void *pool)
+static u64 z3fold_zpool_total_pages(void *pool)
{
- return z3fold_get_pool_size(pool);
+ return z3fold_get_pool_pages(pool);
}

static struct zpool_driver z3fold_zpool_driver = {
@@ -1417,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = {
.free = z3fold_zpool_free,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
- .total_size = z3fold_zpool_total_size,
+ .total_pages = z3fold_zpool_total_pages,
};

MODULE_ALIAS("zpool-z3fold");
diff --git a/mm/zbud.c b/mm/zbud.c
index b7d8a22bbf5f..e9836fff9438 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
}

/**
- * zbud_get_pool_size() - gets the zbud pool size in pages
+ * zbud_get_pool_pages() - gets the zbud pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-static u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_pages(struct zbud_pool *pool)
{
return pool->pages_nr;
}
@@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
zbud_unmap(pool, handle);
}

-static u64 zbud_zpool_total_size(void *pool)
+static u64 zbud_zpool_total_pages(void *pool)
{
- return zbud_get_pool_size(pool);
+ return zbud_get_pool_pages(pool);
}

static struct zpool_driver zbud_zpool_driver = {
@@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = {
.free = zbud_zpool_free,
.map = zbud_zpool_map,
.unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
+ .total_pages = zbud_zpool_total_pages,
};

MODULE_ALIAS("zpool-zbud");
diff --git a/mm/zpool.c b/mm/zpool.c
index 410808aee7fe..b9fda1fa857d 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
}

/**
- * zpool_get_total_size() - The total size of the pool
+ * zpool_get_total_pages() - The total size of the pool
* @zpool: The zpool to check
*
* This returns the total size in pages of the pool.
*
* Returns: Total size of the zpool in pages.
*/
-u64 zpool_get_total_size(struct zpool *zpool)
+u64 zpool_get_total_pages(struct zpool *zpool)
{
- return zpool->driver->total_size(zpool->pool);
+ return zpool->driver->total_pages(zpool->pool);
}

/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 398f3856817f..b42d3545ca85 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -399,7 +399,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
zs_unmap_object(pool, handle);
}

-static u64 zs_zpool_total_size(void *pool)
+static u64 zs_zpool_total_pages(void *pool)
{
return zs_get_total_pages(pool);
}
@@ -414,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = {
.free = zs_zpool_free,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .total_pages = zs_zpool_total_pages,
};

MODULE_ALIAS("zpool-zsmalloc");
diff --git a/mm/zswap.c b/mm/zswap.c
index 7ed79caf1e1e..9fdf4c76d5ea 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -519,7 +519,7 @@ unsigned long zswap_total_pages(void)
int i;

for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- total += zpool_get_total_size(pool->zpools[i]);
+ total += zpool_get_total_pages(pool->zpools[i]);
}
rcu_read_unlock();


2024-03-12 04:04:46

by Yosry Ahmed

Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

On Mon, Mar 11, 2024 at 10:34:11PM -0400, Johannes Weiner wrote:
> On Mon, Mar 11, 2024 at 10:09:35PM +0000, Yosry Ahmed wrote:
> > On Mon, Mar 11, 2024 at 12:12:13PM -0400, Johannes Weiner wrote:
> > > Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> > > total cycles currently going into updating the zswap_pool_total_size.
> >
> > Yikes. I have always hated that size update scheme FWIW.
> >
> > I have also wondered whether it makes sense to just maintain the number
> > of pages in zswap as an atomic, like zswap_stored_pages. I guess your
> > proposed scheme is even cheaper for the load/invalidate paths because we
> > do nothing at all. It could be an option if the aggregation in other
> > paths ever becomes a problem, but we would need to make sure it
> > doesn't regress the load/invalidate paths. Just sharing some thoughts.
>
> Agree with you there. I actually tried doing it that way at first, but
> noticed zram uses zs_get_total_pages() and actually wants a per-pool
> count. I didn't want the backend to have to update two atomics, so I
> settled for this version.

Could be useful to document this context if you send a v2. This version
is a big improvement anyway, so hopefully we don't need to revisit.

>
> > > There are three consumers of this counter:
> > > - store, to enforce the globally configured pool limit
> > > - meminfo & debugfs, to report the size to the user
> > > - shrink, to determine the batch size for each cycle
> > >
> > > Instead of aggregating everytime an entry enters or exits the zswap
> > > pool, aggregate the value from the zpools on-demand:
> > >
> > > - Stores aggregate the counter anyway upon success. Aggregating to
> > > check the limit instead is the same amount of work.
> > >
> > > - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> > > counter, but aren't exactly hotpaths.
> > >
> > > - Shrinking can aggregate once for every cycle instead of doing it for
> > > every freed entry. As the shrinker might work on tens or hundreds of
> > > objects per scan cycle, this is a large reduction in aggregations.
> > >
> > > The paths that benefit dramatically are swapin, swapoff, and
> > > unmaps. There could be millions of pages being processed until
> > > somebody asks for the pool size again. This eliminates the pool size
> > > updates from those paths entirely.
> >
> > This looks like a big win, thanks! I wonder if you have any numbers of
> > perf profiles to share. That would be nice to have, but I think the
> > benefit is clear regardless.
>
> I deleted the perf files already, but can re-run it tomorrow.

Thanks!

>
> > I also like the implicit cleanup when we switch to maintaining the
> > number of pages rather than bytes. The code looks much better with all
> > the shifts and divisions gone :)
> >
> > I have a couple of comments below. With them addressed, feel free to
> > add:
> > Acked-by: Yosry Ahmed <[email protected]>
>
> Thanks!
>
> > > @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> > > {
> > > struct mem_cgroup *memcg;
> > > int ret, failures = 0;
> > > + unsigned long thr;
> > > +
> > > + /* Reclaim down to the accept threshold */
> > > + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
> >
> > This calculation is repeated twice, so I'd rather keep a helper for it
> > as an alternative to zswap_can_accept(). Perhaps zswap_threshold_page()
> > or zswap_acceptance_pages()?
>
> Sounds good. I went with zswap_accept_thr_pages().

Even better.

>
> > > @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
> > >
> > > static struct dentry *zswap_debugfs_root;
> > >
> > > +static int debugfs_get_total_size(void *data, u64 *val)
> > > +{
> > > + *val = zswap_total_pages() * PAGE_SIZE;
> > > + return 0;
> > > +}
> > > +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> >
> > I think we are missing a newline here to maintain the current format
> > (i.e "%llu\n").
>
> Oops, good catch! I had verified the debugfs file (along with the
> others) with 'grep . *', which hides that this is missing. Fixed up.
>
> Thanks for taking a look. The incremental diff is below. I'll run the
> tests and recapture the numbers tomorrow, then send v2.

LGTM. Feel free to carry the Ack forward.

2024-03-12 04:08:02

by Yosry Ahmed

Subject: Re: [PATCH 2/2] mm: zpool: return pool size in pages

On Mon, Mar 11, 2024 at 10:36:37PM -0400, Johannes Weiner wrote:
> On Mon, Mar 11, 2024 at 10:12:02PM +0000, Yosry Ahmed wrote:
> > On Mon, Mar 11, 2024 at 12:12:14PM -0400, Johannes Weiner wrote:
> > > All zswap backends track their pool sizes in pages. Currently they
> > > multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> > > order to do limit math. Report pages directly.
> >
> > Nice. Although I would prefer renaming the zpool interface to
> > total_pages and renaming the zpool backends functions as well to use
> > pages rather than size.
>
> Ha, I was on the fence, since it's kind of churny. But if you don't
> mind, then it works for me as well.

If we are cleaning up, might as well do it all the way :)

>
> > Either way:
> > Acked-by: Yosry Ahmed <[email protected]>
>
> Thanks.

LGTM. Feel free to carry the Ack forward.

2024-03-12 04:55:58

by Chengming Zhou

Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

On 2024/3/12 00:12, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.
>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Great! This is a clever simplification and optimization.

With the incremental diff, feel free to add:

Reviewed-by: Chengming Zhou <[email protected]>

Thanks.

> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,

2024-03-12 04:56:53

by Chengming Zhou

Subject: Re: [PATCH 2/2] mm: zpool: return pool size in pages

On 2024/3/12 00:12, Johannes Weiner wrote:
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.
>
> Signed-off-by: Johannes Weiner <[email protected]>

With the incremental diff, feel free to add:

Reviewed-by: Chengming Zhou <[email protected]>

Thanks.

> ---
> mm/z3fold.c | 2 +-
> mm/zbud.c | 2 +-
> mm/zpool.c | 4 ++--
> mm/zsmalloc.c | 2 +-
> mm/zswap.c | 4 ++--
> 5 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 7ab05621052d..9bacacd4168c 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -1404,7 +1404,7 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 z3fold_zpool_total_size(void *pool)
> {
> - return z3fold_get_pool_size(pool) * PAGE_SIZE;
> + return z3fold_get_pool_size(pool);
> }
>
> static struct zpool_driver z3fold_zpool_driver = {
> diff --git a/mm/zbud.c b/mm/zbud.c
> index 2190cc1f37b3..b7d8a22bbf5f 100644
> --- a/mm/zbud.c
> +++ b/mm/zbud.c
> @@ -412,7 +412,7 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 zbud_zpool_total_size(void *pool)
> {
> - return zbud_get_pool_size(pool) * PAGE_SIZE;
> + return zbud_get_pool_size(pool);
> }
>
> static struct zpool_driver zbud_zpool_driver = {
> diff --git a/mm/zpool.c b/mm/zpool.c
> index 846410479c2f..410808aee7fe 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -324,9 +324,9 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
> * zpool_get_total_size() - The total size of the pool
> * @zpool: The zpool to check
> *
> - * This returns the total size in bytes of the pool.
> + * This returns the total size in pages of the pool.
> *
> - * Returns: Total size of the zpool in bytes.
> + * Returns: Total size of the zpool in pages.
> */
> u64 zpool_get_total_size(struct zpool *zpool)
> {
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 7d7cb3eaabe0..398f3856817f 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -401,7 +401,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
>
> static u64 zs_zpool_total_size(void *pool)
> {
> - return zs_get_total_pages(pool) << PAGE_SHIFT;
> + return zs_get_total_pages(pool);
> }
>
> static struct zpool_driver zs_zpool_driver = {
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 7c39327a7cc2..fe4343e416e0 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -507,7 +507,7 @@ static unsigned long zswap_max_pages(void)
> unsigned long zswap_total_pages(void)
> {
> struct zswap_pool *pool;
> - u64 total = 0;
> + unsigned long total = 0;
>
> rcu_read_lock();
> list_for_each_entry_rcu(pool, &zswap_pools, list) {
> @@ -518,7 +518,7 @@ unsigned long zswap_total_pages(void)
> }
> rcu_read_unlock();
>
> - return total >> PAGE_SHIFT;
> + return total;
> }
>
> /*********************************

2024-03-12 09:12:55

by Nhat Pham

Subject: Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

On Mon, Mar 11, 2024 at 11:12 PM Johannes Weiner <[email protected]> wrote:
>
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
>
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
>
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
>
> - Stores aggregate the counter anyway upon success. Aggregating to
> check the limit instead is the same amount of work.
>
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
> counter, but aren't exactly hotpaths.
>
> - Shrinking can aggregate once for every cycle instead of doing it for
> every freed entry. As the shrinker might work on tens or hundreds of
> objects per scan cycle, this is a large reduction in aggregations.

Nice!

>
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
>
> Signed-off-by: Johannes Weiner <[email protected]>

With your fixlet applied:
Reviewed-by: Nhat Pham <[email protected]>

> ---
> fs/proc/meminfo.c | 3 +-
> include/linux/zswap.h | 2 +-
> mm/zswap.c | 98 +++++++++++++++++++++----------------------
> 3 files changed, 49 insertions(+), 54 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "SwapTotal: ", i.totalswap);
> show_val_kb(m, "SwapFree: ", i.freeswap);
> #ifdef CONFIG_ZSWAP
> - seq_printf(m, "Zswap: %8lu kB\n",
> - (unsigned long)(zswap_pool_total_size >> 10));
> + show_val_kb(m, "Zswap: ", zswap_total_pages());
> seq_printf(m, "Zswapped: %8lu kB\n",
> (unsigned long)atomic_read(&zswap_stored_pages) <<
> (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>
> struct lruvec;
>
> -extern u64 zswap_pool_total_size;
> extern atomic_t zswap_stored_pages;
>
> #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
> atomic_long_t nr_zswap_protected;
> };
>
> +unsigned long zswap_total_pages(void);
> bool zswap_store(struct folio *folio);
> bool zswap_load(struct folio *folio);
> void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
> /*********************************
> * statistics
> **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
> /* The number of compressed pages currently stored in zswap */
> atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
> pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
> zpool_get_type((p)->zpools[0]))
>
> -static bool zswap_is_full(void)
> -{
> - return totalram_pages() * zswap_max_pool_percent / 100 <
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> - return totalram_pages() * zswap_accept_thr_percent / 100 *
> - zswap_max_pool_percent / 100 >
> - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> - u64 pool_size = 0;
> - int i;
> -
> - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> - pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> - return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> - struct zswap_pool *pool;
> - u64 total = 0;
> -
> - rcu_read_lock();
> -
> - list_for_each_entry_rcu(pool, &zswap_pools, list)
> - total += get_zswap_pool_size(pool);
> -
> - rcu_read_unlock();
> -
> - zswap_pool_total_size = total;
> -}
> -
> /*********************************
> * pool functions
> **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> return NULL;
> }
>
> +static unsigned long zswap_max_pages(void)
> +{
> + return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> + struct zswap_pool *pool;
> + u64 total = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(pool, &zswap_pools, list) {
> + int i;
> +
> + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> + total += zpool_get_total_size(pool->zpools[i]);
> + }
> + rcu_read_unlock();
> +
> + return total >> PAGE_SHIFT;
> +}
> +
> /*********************************
> * param callbacks
> **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
> }
> zswap_entry_cache_free(entry);
> atomic_dec(&zswap_stored_pages);
> - zswap_update_total_size();
> }
>
> /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
> nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
> #else
> /* use pool stats instead of memcg stats */
> - nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> + nr_backing = zswap_total_pages();
> nr_stored = atomic_read(&zswap_nr_stored);
> #endif
>
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
> {
> struct mem_cgroup *memcg;
> int ret, failures = 0;
> + unsigned long thr;
> +
> + /* Reclaim down to the accept threshold */
> + thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>
> /* global reclaim will select cgroup in a round-robin fashion. */
> do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
> break;
> if (ret && ++failures == MAX_RECLAIM_RETRIES)
> break;
> -
> resched:
> cond_resched();
> - } while (!zswap_can_accept());
> + } while (zswap_total_pages() > thr);
> }
>
> static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry, *dupentry;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + unsigned long max_pages, cur_pages;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
> if (!zswap_enabled)
> goto check_old;
>
> + /* Check cgroup limits */
> objcg = get_obj_cgroup_from_folio(folio);
> if (objcg && !obj_cgroup_may_zswap(objcg)) {
> memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - /* reclaim space if needed */
> - if (zswap_is_full()) {
> + /* Check global limits */
> + cur_pages = zswap_total_pages();
> + max_pages = zswap_max_pages();
> +
> + if (cur_pages >= max_pages) {
> zswap_pool_limit_hit++;
> zswap_pool_reached_full = true;
> goto shrink;
> }
>
> if (zswap_pool_reached_full) {
> - if (!zswap_can_accept())
> + unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> + if (cur_pages > thr)
> goto shrink;
> else
> zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>
> /* update stats */
> atomic_inc(&zswap_stored_pages);
> - zswap_update_total_size();
> count_vm_event(ZSWPOUT);
>
> return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>
> static struct dentry *zswap_debugfs_root;
>
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> + *val = zswap_total_pages() * PAGE_SIZE;
> + return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
> static int zswap_debugfs_init(void)
> {
> if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
> zswap_debugfs_root, &zswap_reject_compress_poor);
> debugfs_create_u64("written_back_pages", 0444,
> zswap_debugfs_root, &zswap_written_back_pages);
> - debugfs_create_u64("pool_total_size", 0444,
> - zswap_debugfs_root, &zswap_pool_total_size);
> + debugfs_create_file("pool_total_size", 0444,
> + zswap_debugfs_root, NULL, &total_size_fops);
> debugfs_create_atomic_t("stored_pages", 0444,
> zswap_debugfs_root, &zswap_stored_pages);
> debugfs_create_atomic_t("same_filled_pages", 0444,
> --
> 2.44.0
>

2024-03-12 09:16:40

by Nhat Pham

Subject: Re: [PATCH 2/2] mm: zpool: return pool size in pages

On Mon, Mar 11, 2024 at 11:12 PM Johannes Weiner <[email protected]> wrote:
>
> All zswap backends track their pool sizes in pages. Currently they
> multiply by PAGE_SIZE for zswap, only for zswap to divide again in
> order to do limit math. Report pages directly.

I've always found this to be weird. Perhaps the original author of
this API wanted to support more fine-grained memory consumption
tracking?

>
> Signed-off-by: Johannes Weiner <[email protected]>

Anyway, let's just work with realities, rather than hypotheticals :)
With the renaming fixlet:
Reviewed-by: Nhat Pham <[email protected]>