2023-06-21 18:42:07

by Yuanchu Xie

Subject: [RFC PATCH v2 0/6] mm: working set reporting

RFC v1: https://lore.kernel.org/linux-mm/[email protected]/
For background and interfaces, see the RFC v1 posting.

Changes from v1 -> v2:
- Refactored the patches into smaller pieces
- Renamed interfaces and functions from wss to wsr (Working Set Reporting)
- Fixed build errors when CONFIG_WSR is not set
- Changed working_set_num_bins to u8 for virtio-balloon
- Added support for per-NUMA node reporting for virtio-balloon

The RFC adds CONFIG_WSR, which currently requires MGLRU to function. T.J. and
I aim to support the active/inactive LRU and working set estimation from
userspace as well. This series should be built with the following configs:
CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y
CONFIG_VIRTIO_BALLOON=y
CONFIG_WSR=y
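
For a rough sense of how a userspace consumer could use the memcg interface
from patch 4, here is a minimal sketch that polls memory.wsr.histogram for
report notifications and re-reads it. The cgroup mount point and memcg name
are made up for illustration, not part of this series:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	/* hypothetical memcg path, assuming cgroup2 mounted at /sys/fs/cgroup */
	int fd = open("/sys/fs/cgroup/workload/memory.wsr.histogram", O_RDONLY);

	if (fd < 0)
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
		ssize_t len;

		/* report_ws() signals a new report with EPOLLPRI */
		if (poll(&pfd, 1, -1) < 0)
			break;

		/* seek back to the start and re-read the refreshed histogram */
		if (lseek(fd, 0, SEEK_SET) < 0)
			break;
		len = read(fd, buf, sizeof(buf) - 1);
		if (len <= 0)
			break;
		buf[len] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}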

TODO list:
- There's a hack in mm/vmscan.c that calls into the virtio-balloon driver,
which doesn't work if CONFIG_VIRTIO_BALLOON=m. T.J. Alumbaugh ([email protected])
and I plan to solve this with a working set notification mechanism
that would allow multiple consumers to subscribe to working set changes;
a rough sketch follows this list.
- memory.reaccess.histogram does not consider swapped-out pages to be reaccessed.
I plan to implement this with the shadow entry computed in mm/workingset.c.
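
A rough sketch of the notification mechanism mentioned in the first TODO
item, using a notifier chain so that modular consumers such as
virtio-balloon can subscribe. All names below are made up, and a real
implementation would have to account for reclaim context:

#include <linux/export.h>
#include <linux/mmzone.h>
#include <linux/notifier.h>

static BLOCKING_NOTIFIER_HEAD(ws_notifier_list);

/* consumers (e.g. virtio-balloon) subscribe for working set changes */
int ws_notifier_register(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&ws_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(ws_notifier_register);

/* would replace the direct call from mm/vmscan.c into the balloon driver */
void ws_notify(struct pglist_data *pgdat)
{
	blocking_notifier_call_chain(&ws_notifier_list, pgdat->node_id, pgdat);
}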

QEMU device implementation:
https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg06617.html

virtio-dev spec proposal v1 (v2 to be posted by T.J.):
https://lore.kernel.org/virtio-dev/CABmGT5Hv6Jd_F9EoQqVMDo4w5=7wJYmS4wwYDqXK3wov44Tf=w@mail.gmail.com/

LSF/MM discussion slides:
https://lore.kernel.org/linux-mm/CABmGT5HK9xHz=E4q4sECCD8XodP9DUcH0dMeQ8kznUQB5HTQhQ@mail.gmail.com/

T.J. Alumbaugh (1):
virtio-balloon: Add Working Set reporting

Yuanchu Xie (5):
mm: aggregate working set information into histograms
mm: add working set refresh threshold to rate-limit aggregation
mm: report working set when under memory pressure
mm: extend working set reporting to memcgs
mm: add per-memcg reaccess histogram

drivers/base/node.c | 3 +
drivers/virtio/virtio_balloon.c | 288 +++++++++++++++++
include/linux/balloon_compaction.h | 3 +
include/linux/memcontrol.h | 6 +
include/linux/mmzone.h | 5 +
include/linux/wsr.h | 114 +++++++
include/uapi/linux/virtio_balloon.h | 33 ++
mm/Kconfig | 7 +
mm/Makefile | 1 +
mm/internal.h | 12 +
mm/memcontrol.c | 351 ++++++++++++++++++++-
mm/mmzone.c | 3 +
mm/vmscan.c | 194 +++++++++++-
mm/wsr.c | 464 ++++++++++++++++++++++++++++
14 files changed, 1480 insertions(+), 4 deletions(-)
create mode 100644 include/linux/wsr.h
create mode 100644 mm/wsr.c

--
2.41.0.162.gfafddb0af9-goog



2023-06-21 18:42:15

by Yuanchu Xie

Subject: [RFC PATCH v2 4/6] mm: extend working set reporting to memcgs

Break down the system-wide working set reporting into
per-memcg reports, each of which aggregates its children
hierarchically. The per-node working set histograms and
refresh/report threshold files are presented as memcg files,
with each file reporting all nodes.
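
For illustration (values are made up): memory.wsr.intervals_ms accepts
per-node writes of the form "N0=600,1200,1800", i.e. a node id followed
by comma-separated bin boundaries in milliseconds, and
memory.wsr.histogram reads back one block per node, ending with a
catch-all bin:

N0
600 anon=1024 file=2048
1200 anon=512 file=256
1800 anon=128 file=64
9223372036854775807 anon=32 file=16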

Signed-off-by: T.J. Alumbaugh <[email protected]>
Signed-off-by: Yuanchu Xie <[email protected]>
---
include/linux/memcontrol.h | 6 +
include/linux/wsr.h | 4 +
mm/memcontrol.c | 262 ++++++++++++++++++++++++++++++++++++-
mm/vmscan.c | 9 +-
4 files changed, 277 insertions(+), 4 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85dc9b88ea379..96971aa6a48cd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -10,6 +10,7 @@

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
+#include <linux/wait.h>
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
@@ -325,6 +326,11 @@ struct mem_cgroup {
struct lru_gen_mm_list mm_list;
#endif

+#ifdef CONFIG_WSR
+ int wsr_event;
+ wait_queue_head_t wsr_wait_queue;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};

diff --git a/include/linux/wsr.h b/include/linux/wsr.h
index 85c901ce026b9..d45f7cc0672ac 100644
--- a/include/linux/wsr.h
+++ b/include/linux/wsr.h
@@ -48,6 +48,7 @@ ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins);
*/
void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
struct pglist_data *pgdat);
+void report_ws(struct pglist_data *pgdat, struct scan_control *sc);
#else
struct ws_bin;
struct wsr;
@@ -73,6 +74,9 @@ static inline void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
struct pglist_data *pgdat)
{
}
+static inline void report_ws(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
#endif /* CONFIG_WSR */

#endif /* _LINUX_WSR_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2eee092f8f119..edf5bb31bb19c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/

+#include <linux/wait.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
@@ -65,6 +66,7 @@
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
+#include <linux/wsr.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"
@@ -5233,6 +5235,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;

+ wsr_destroy(&pn->lruvec);
free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5311,6 +5314,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
memcg->deferred_split_queue.split_queue_len = 0;
+#endif
+#ifdef CONFIG_WSR
+ memcg->wsr_event = 0;
+ init_waitqueue_head(&memcg->wsr_wait_queue);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
lru_gen_init_memcg(memcg);
@@ -5411,6 +5418,11 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&memcg->event_list_lock);

+#ifdef CONFIG_WSR
+ wake_up_pollfree(&memcg->wsr_wait_queue);
+ synchronize_rcu();
+#endif
+
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);

@@ -6642,6 +6654,228 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
return nbytes;
}

+#ifdef CONFIG_WSR
+static int memory_wsr_intervals_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr;
+ struct ws_bin *bin;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->bins_lock);
+ seq_printf(m, "N%d=", nid);
+ for (bin = wsr->bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u,", jiffies_to_msecs(bin->idle_age));
+ mutex_unlock(&wsr->bins_lock);
+
+ seq_printf(m, "%lld ", LLONG_MAX);
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wsr_intervals_ms_parse(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ unsigned int *nid_out,
+ struct ws_bin *bins)
+{
+ char *node, *intervals;
+ unsigned int nid;
+ int err;
+
+ buf = strstrip(buf);
+ intervals = buf;
+ node = strsep(&intervals, "=");
+
+ if (*node != 'N')
+ return -EINVAL;
+
+ err = kstrtouint(node + 1, 0, &nid);
+ if (err)
+ return err;
+
+ if (nid >= nr_node_ids || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ err = wsr_intervals_ms_parse(intervals, bins);
+ if (err)
+ return err;
+
+ *nid_out = nid;
+ return 0;
+}
+
+static ssize_t memory_wsr_intervals_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int nid;
+ int err;
+ struct wsr *wsr;
+ struct ws_bin *bins;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ bins = kzalloc(sizeof(wsr->bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ err = memory_wsr_intervals_ms_parse(of, buf, nbytes, &nid, bins);
+ if (err)
+ goto failed;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->bins_lock);
+ memcpy(wsr->bins, bins, sizeof(wsr->bins));
+ mutex_unlock(&wsr->bins_lock);
+failed:
+ kfree(bins);
+ return err ?: nbytes;
+}
+
+static int memory_wsr_refresh_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr =
+ lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+
+ seq_printf(m, "N%d=%u ", nid,
+ jiffies_to_msecs(READ_ONCE(wsr->refresh_threshold)));
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wsr_threshold_parse(char *buf, size_t nbytes,
+ unsigned int *nid_out,
+ unsigned int *msecs)
+{
+ char *node, *threshold;
+ unsigned int nid;
+ int err;
+
+ buf = strstrip(buf);
+ threshold = buf;
+ node = strsep(&threshold, "=");
+
+ if (*node != 'N')
+ return -EINVAL;
+
+ err = kstrtouint(node + 1, 0, &nid);
+ if (err)
+ return err;
+
+ if (nid >= nr_node_ids || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ err = kstrtouint(threshold, 0, msecs);
+ if (err)
+ return err;
+
+ *nid_out = nid;
+
+ return nbytes;
+}
+
+static ssize_t memory_wsr_refresh_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ unsigned int nid, msecs;
+ struct wsr *wsr;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = memory_wsr_threshold_parse(buf, nbytes, &nid, &msecs);
+
+ if (ret < 0)
+ return ret;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ WRITE_ONCE(wsr->refresh_threshold, msecs_to_jiffies(msecs));
+ return ret;
+}
+
+static int memory_wsr_report_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr =
+ lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+
+ seq_printf(m, "N%d=%u ", nid,
+ jiffies_to_msecs(READ_ONCE(wsr->report_threshold)));
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wsr_report_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ unsigned int nid, msecs;
+ struct wsr *wsr;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = memory_wsr_threshold_parse(buf, nbytes, &nid, &msecs);
+
+ if (ret < 0)
+ return ret;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ WRITE_ONCE(wsr->report_threshold, msecs_to_jiffies(msecs));
+ return ret;
+}
+
+static int memory_wsr_histogram_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr =
+ lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ struct ws_bin *bin;
+
+ seq_printf(m, "N%d\n", nid);
+
+ mutex_lock(&wsr->bins_lock);
+ wsr_refresh(wsr, memcg, NODE_DATA(nid));
+ for (bin = wsr->bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age),
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ seq_printf(m, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wsr->bins_lock);
+ }
+
+ return 0;
+}
+
+__poll_t memory_wsr_histogram_poll(struct kernfs_open_file *of,
+ struct poll_table_struct *pt)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (memcg->css.flags & CSS_DYING)
+ return DEFAULT_POLLMASK;
+
+ poll_wait(of->file, &memcg->wsr_wait_queue, pt);
+ if (cmpxchg(&memcg->wsr_event, 1, 0) == 1)
+ return DEFAULT_POLLMASK | EPOLLPRI;
+ return DEFAULT_POLLMASK;
+}
+#endif
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -6710,7 +6944,33 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
- { } /* terminate */
+#ifdef CONFIG_WSR
+ {
+ .name = "wsr.intervals_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wsr_intervals_ms_show,
+ .write = memory_wsr_intervals_ms_write,
+ },
+ {
+ .name = "wsr.refresh_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wsr_refresh_ms_show,
+ .write = memory_wsr_refresh_ms_write,
+ },
+ {
+ .name = "wsr.report_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wsr_report_ms_show,
+ .write = memory_wsr_report_ms_write,
+ },
+ {
+ .name = "wsr.histogram",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wsr_histogram_show,
+ .poll = memory_wsr_histogram_poll,
+ },
+#endif
+ {} /* terminate */
};

struct cgroup_subsys memory_cgrp_subsys = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c56fddcec88fb..ba254b6e91e19 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4559,8 +4559,6 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
return true;
}

-static void report_ws(struct pglist_data *pgdat, struct scan_control *sc);
-
/* to protect the working set of the last N jiffies */
static unsigned long lru_gen_min_ttl __read_mostly;

@@ -5937,7 +5935,7 @@ void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
}
}

-static void report_ws(struct pglist_data *pgdat, struct scan_control *sc)
+void report_ws(struct pglist_data *pgdat, struct scan_control *sc)
{
static DEFINE_RATELIMIT_STATE(rate, HZ, 3);

@@ -5969,6 +5967,8 @@ static void report_ws(struct pglist_data *pgdat, struct scan_control *sc)

if (wsr->notifier)
kernfs_notify(wsr->notifier);
+ if (memcg && cmpxchg(&memcg->wsr_event, 0, 1) == 0)
+ wake_up_interruptible(&memcg->wsr_wait_queue);
}
#endif /* CONFIG_WSR */

@@ -6486,6 +6486,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (zone->zone_pgdat == last_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
+
+ if (!sc->proactive)
+ report_ws(zone->zone_pgdat, sc);
shrink_node(zone->zone_pgdat, sc);
}

--
2.41.0.162.gfafddb0af9-goog


2023-06-21 18:43:25

by Yuanchu Xie

Subject: [RFC PATCH v2 5/6] mm: add per-memcg reaccess histogram

A reaccess refers to detecting an access on a page via refault
or access bit harvesting after the initial access. Similar to
the working set histogram, the reaccess histogram breaks down
reaccesses into user-defined bins.

Currently it only tracks reaccesses from access bit harvesting;
the plan is to include refaults in the same histogram by pulling
information from the folio->mapping->i_pages shadow entries of
swapped-out pages.
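
Reaccesses are inferred from the page table walk batch: a reaccessed
page moves from its old generation to max_seq, so the negative page
count deltas of the older generations give the number of reaccesses,
attributed to bins by the age of the old generation and split
proportionally where a generation straddles a bin boundary. As a
made-up example, with a single bin boundary at 600ms, a generation
spanning idle ages of 400ms to 1000ms that lost 100 pages to max_seq
would credit roughly a third of them to the 600ms bin and the rest to
the catch-all bin. memory.reaccess.histogram reads like
memory.wsr.histogram:

N0
600 anon=70 file=30
9223372036854775807 anon=5 file=1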

Signed-off-by: T.J. Alumbaugh <[email protected]>
Signed-off-by: Yuanchu Xie <[email protected]>
---
include/linux/wsr.h | 9 +++-
mm/memcontrol.c | 89 ++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 6 ++-
mm/wsr.c | 101 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/include/linux/wsr.h b/include/linux/wsr.h
index d45f7cc0672ac..68246734679cd 100644
--- a/include/linux/wsr.h
+++ b/include/linux/wsr.h
@@ -26,11 +26,14 @@ struct ws_bin {
struct wsr {
/* protects bins */
struct mutex bins_lock;
+ /* protects reaccess_bins */
+ struct mutex reaccess_bins_lock;
struct kernfs_node *notifier;
unsigned long timestamp;
unsigned long report_threshold;
unsigned long refresh_threshold;
struct ws_bin bins[MAX_NR_BINS];
+ struct ws_bin reaccess_bins[MAX_NR_BINS];
};

void wsr_register_node(struct node *node);
@@ -48,6 +51,7 @@ ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins);
*/
void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
struct pglist_data *pgdat);
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk);
void report_ws(struct pglist_data *pgdat, struct scan_control *sc);
#else
struct ws_bin;
@@ -71,7 +75,10 @@ static inline ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins)
return -EINVAL;
}
static inline void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat)
+{
+}
+static inline void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
}
static inline void report_ws(struct pglist_data *pgdat, struct scan_control *sc)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index edf5bb31bb19c..b901982d659d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6736,6 +6736,56 @@ static ssize_t memory_wsr_intervals_ms_write(struct kernfs_open_file *of,
return err ?: nbytes;
}

+static int memory_reaccess_intervals_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr;
+ struct ws_bin *bin;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->reaccess_bins_lock);
+ seq_printf(m, "N%d=", nid);
+ for (bin = wsr->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u,", jiffies_to_msecs(bin->idle_age));
+ mutex_unlock(&wsr->reaccess_bins_lock);
+
+ seq_printf(m, "%lld ", LLONG_MAX);
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_reaccess_intervals_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int nid;
+ int err;
+ struct wsr *wsr;
+ struct ws_bin *bins;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ bins = kzalloc(sizeof(wsr->reaccess_bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ err = memory_wsr_intervals_ms_parse(of, buf, nbytes, &nid, bins);
+ if (err)
+ goto failed;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->reaccess_bins_lock);
+ memcpy(wsr->reaccess_bins, bins, sizeof(wsr->reaccess_bins));
+ mutex_unlock(&wsr->reaccess_bins_lock);
+failed:
+ kfree(bins);
+ return err ?: nbytes;
+}
+
static int memory_wsr_refresh_ms_show(struct seq_file *m, void *v)
{
int nid;
@@ -6874,6 +6924,34 @@ __poll_t memory_wsr_histogram_poll(struct kernfs_open_file *of,
return DEFAULT_POLLMASK | EPOLLPRI;
return DEFAULT_POLLMASK;
}
+
+static int memory_reaccess_histogram_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr =
+ lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ struct ws_bin *bin;
+
+ seq_printf(m, "N%d\n", nid);
+
+ mutex_lock(&wsr->reaccess_bins_lock);
+ wsr_refresh(wsr, memcg, NODE_DATA(nid));
+ for (bin = wsr->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age),
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ seq_printf(m, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wsr->reaccess_bins_lock);
+ }
+
+ return 0;
+}
#endif

static struct cftype memory_files[] = {
@@ -6969,6 +7047,17 @@ static struct cftype memory_files[] = {
.seq_show = memory_wsr_histogram_show,
.poll = memory_wsr_histogram_poll,
},
+ {
+ .name = "reaccess.intervals_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_intervals_ms_show,
+ .write = memory_reaccess_intervals_ms_write,
+ },
+ {
+ .name = "reaccess.histogram",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_histogram_show,
+ },
#endif
{} /* terminate */
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba254b6e91e19..bc8c026ceef0d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4226,6 +4226,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
mem_cgroup_unlock_pages();

if (walk->batched) {
+ report_reaccess(lruvec, walk);
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(lruvec, walk);
spin_unlock_irq(&lruvec->lru_lock);
@@ -5079,11 +5080,14 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_scanned -= folio_nr_pages(folio);
}

+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ report_reaccess(lruvec, walk);
+
spin_lock_irq(&lruvec->lru_lock);

move_folios_to_lru(lruvec, &list);

- walk = current->reclaim_state->mm_walk;
if (walk && walk->batched)
reset_batch_size(lruvec, walk);

diff --git a/mm/wsr.c b/mm/wsr.c
index cd045ade5e9ba..a63d678e64f8b 100644
--- a/mm/wsr.c
+++ b/mm/wsr.c
@@ -23,8 +23,10 @@ void wsr_init(struct lruvec *lruvec)
struct wsr *wsr = lruvec_wsr(lruvec);

mutex_init(&wsr->bins_lock);
+ mutex_init(&wsr->reaccess_bins_lock);
wsr->bins[0].idle_age = -1;
wsr->notifier = NULL;
+ wsr->reaccess_bins[0].idle_age = -1;
}

void wsr_destroy(struct lruvec *lruvec)
@@ -32,6 +34,7 @@ void wsr_destroy(struct lruvec *lruvec)
struct wsr *wsr = lruvec_wsr(lruvec);

mutex_destroy(&wsr->bins_lock);
+ mutex_destroy(&wsr->reaccess_bins_lock);
memset(wsr, 0, sizeof(*wsr));
}

@@ -172,6 +175,104 @@ void refresh_wsr(struct wsr *wsr, struct mem_cgroup *root,
cond_resched();
} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}
+
+static void collect_reaccess_locked(struct wsr *wsr,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ unsigned long curr_timestamp = jiffies;
+ unsigned long max_seq = READ_ONCE(walk->max_seq);
+ unsigned long min_seq[ANON_AND_FILE] = {
+ READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]),
+ READ_ONCE(lrugen->min_seq[LRU_GEN_FILE]),
+ };
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+ struct ws_bin *bin = wsr->reaccess_bins;
+
+ lockdep_assert_held(&wsr->reaccess_bins_lock);
+ /* Skip max_seq because a reaccess moves a page from another seq
+ * to max_seq. We use the negative change in page count from
+ * other seqs to track the number of reaccesses.
+ */
+ for (seq = max_seq - 1; seq + 1 > min_seq[type]; seq--) {
+ long error;
+ int next_gen;
+ unsigned long birth, gen_start;
+ long delta = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ long nr_pages = walk->nr_pages[gen][type][zone];
+
+ if (nr_pages < 0)
+ delta += -nr_pages;
+ }
+
+ birth = READ_ONCE(lrugen->timestamps[gen]);
+ next_gen = lru_gen_from_seq(seq + 1);
+ gen_start = READ_ONCE(lrugen->timestamps[next_gen]);
+
+ /* ensure gen_start is within idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(gen_start + bin->idle_age,
+ curr_timestamp))
+ bin++;
+
+ error = delta;
+ /* gen exceeds the idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(birth + bin->idle_age,
+ curr_timestamp)) {
+ unsigned long proportion =
+ gen_start -
+ (curr_timestamp - bin->idle_age);
+ unsigned long gen_len = gen_start - birth;
+
+ if (!gen_len)
+ break;
+ if (proportion) {
+ unsigned long split_bin =
+ delta * proportion / gen_len;
+ bin->nr_pages[type] += split_bin;
+ error -= split_bin;
+ }
+ gen_start = curr_timestamp - bin->idle_age;
+ bin++;
+ }
+ bin->nr_pages[type] += error;
+ }
+ }
+}
+
+static void collect_reaccess(struct wsr *wsr,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ if (READ_ONCE(wsr->reaccess_bins->idle_age) == -1)
+ return;
+
+ mutex_lock(&wsr->reaccess_bins_lock);
+ collect_reaccess_locked(wsr, lrugen, walk);
+ mutex_unlock(&wsr->reaccess_bins_lock);
+}
+
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ while (memcg) {
+ collect_reaccess(lruvec_wsr(mem_cgroup_lruvec(
+ memcg, lruvec_pgdat(lruvec))),
+ lrugen, walk);
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
{
int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
--
2.41.0.162.gfafddb0af9-goog


2023-06-21 19:13:48

by Yu Zhao

Subject: Re: [RFC PATCH v2 0/6] mm: working set reporting

On Wed, Jun 21, 2023 at 12:16 PM Yuanchu Xie <[email protected]> wrote:
>
> RFC v1: https://lore.kernel.org/linux-mm/[email protected]/
> For background and interfaces, see the RFC v1 posting.

v1 only mentioned one use case (ballooning), but we both know there
are at least two solid use cases (the other being job
scheduling/bin-packing, e.g., for Kubernetes [1]).

Please do a survey, as thoroughly as possible, of use cases.
* What's the significance of WSR to the landscape, in terms of server
and client use cases?
* How would userspace tools, e.g., a PMU-based memory profiler,
leverage the infra provided by WSR?
* Would those who register slab shrinkers, e.g., for DMA bufs [2], want
to report their working sets?
* Does this effort intersect with memory placement with NUMA and CXL.mem?

[1] https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
[2] https://lore.kernel.org/linux-mm/[email protected]/