Hi all,
Some time ago KOSAKI Motohiro noticed[1] that vmevent might be very
inaccurate (up to 2GB of inaccuracy on very large machines), since the per
CPU stats are only synchronized to the global counters on a time basis or
when we hit the per CPU stat thresholds.
KOSAKI also said that the perf API might be a good inspiration for further
improvements. I must admit I didn't fully get the idea; I'm open to
investigating this route too, but I think it needs a bit more explanation.
Also note that this is just an RFC: I'm showing some ideas and wondering
how you feel about them. Since we now use a memory pressure factor bolted
into the reclaimer code path, we don't desperately need accurate stats, but
they are still a nice thing to have/fix.
Anyway, here we take two approaches:
- Asynchronously sum the per CPU vm_stat diffs into the global stats. This
  is very similar to what we already have for per-zone stats, implemented
  in zone_page_state_snapshot(). The values could still be inaccurate, but
  overall this makes things better;
- Implement configurable per CPU vmstat thresholds. This is a much more
  powerful tool for getting accurate statistics, but it comes with a price:
  it might cause a performance penalty, as we'd update the global stats
  more frequently (in a fast path), so users have to be careful.
The two items are independent, so we might implement one or the other, or
both, or none, if desired. ;-)
Thanks,
Anton.
p.s. Note that the patches are against my vmevent tree, i.e.:
git://git.infradead.org/users/cbou/linux-vmevent.git
[1] http://lkml.indiana.edu/hypermail/linux/kernel/1205.1/00062.html
There are two things that affect vmstat accuracy:
- The interval at which the per CPU pageset stats are synchronized to the
  global stats;
- The per CPU pageset stat thresholds.
Currently, users can only change the update interval (via the stat_interval
sysctl, which is 1 second by default).
As for the thresholds, the maximum threshold is 125 pages, per CPU, per
zone, so the vmstat inaccuracy might be significant. With the vmevent API
we will be able to set the vmstat thresholds as well -- we will use this
small helper for that.
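To put a number on it: assuming 4 KB pages, the worst-case drift is
  125 pages * 4 KB = 500 KB per CPU, per zone
so on, say, a hypothetical 1024-CPU machine with four populated zones the
global counters can be off by up to roughly
  500 KB * 1024 CPUs * 4 zones = ~2 GB
which is in line with the ~2GB figure mentioned in the cover letter.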
Note that since various MM areas depend on this accuracy too, we should be
very careful not to degrade it. Users also have to understand that lower
thresholds put more pressure on CPU caches and can somewhat degrade
performance, especially on very large systems. But that's the price for
accuracy (if it is needed).
p.s.
set_pgdat_percpu_threshold() used for_each_possible_cpu(), and
refresh_zone_stat_thresholds() used for_each_online_cpu(). I think
for_each_possible_cpu() is unnecessary, as on CPU hotplug we call
refresh_zone_stat_thresholds() anyway.
Signed-off-by: Anton Vorontsov <[email protected]>
---
include/linux/vmstat.h | 6 ++++++
mm/vmstat.c | 52 ++++++++++++++++++++++++++++++++++++++------------
2 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ad2cfd5..590808d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -202,6 +202,9 @@ int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
int (*calculate_pressure)(struct zone *));
+s8 set_zone_stat_thresholds(struct zone *zone,
+ int (*calc_thres)(struct zone *zone),
+ s8 force);
#else /* CONFIG_SMP */
/*
@@ -248,6 +251,9 @@ static inline void __dec_zone_page_state(struct page *page,
#define set_pgdat_percpu_threshold(pgdat, callback) { }
+static inline s8 set_zone_stat_thresholds(struct zone *zone,
+ int (*calc_thres)(struct zone *zone),
+ s8 force) { return 0; }
static inline void refresh_cpu_vm_stats(int cpu) { }
static inline void refresh_zone_stat_thresholds(void) { }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a674..3609e3e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -155,22 +155,55 @@ int calculate_normal_threshold(struct zone *zone)
}
/*
+ * set_zone_stat_thresholds() - Set zone stat thresholds
+ * @zone: A zone to set thresholds for
+ * @calc_thres: An optional callback to calculate thresholds
+ * @force: An optional threshold value to force thresholds
+ *
+ * This function sets the stat thresholds for the given zone: either
+ * calculated by the optional @calc_thres callback, or set to @force.
+ * A non-zero @force is remembered and caps thresholds calculated by
+ * later calls; passing NULL/0 restores the normal thresholds.
+ */
+s8 set_zone_stat_thresholds(struct zone *zone,
+ int (*calc_thres)(struct zone *zone),
+ s8 force)
+{
+ static s8 forced_threshold;
+ s8 thres = force;
+ uint cpu;
+
+ if (!calc_thres) {
+ if (!force)
+ calc_thres = calculate_normal_threshold;
+ forced_threshold = force;
+ }
+
+ if (calc_thres) {
+ thres = calc_thres(zone);
+ if (forced_threshold)
+ thres = min(thres, forced_threshold);
+ }
+
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold = thres;
+
+ return thres;
+}
+
+/*
* Refresh the thresholds for each zone.
*/
void refresh_zone_stat_thresholds(void)
{
struct zone *zone;
- int cpu;
int threshold;
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_normal_threshold(zone);
-
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
+ threshold = set_zone_stat_thresholds(zone,
+ calculate_normal_threshold, 0);
/*
* Only set percpu_drift_mark if there is a danger that
@@ -189,8 +222,6 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
int (*calculate_pressure)(struct zone *))
{
struct zone *zone;
- int cpu;
- int threshold;
int i;
for (i = 0; i < pgdat->nr_zones; i++) {
@@ -198,10 +229,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
if (!zone->percpu_drift_mark)
continue;
- threshold = (*calculate_pressure)(zone);
- for_each_possible_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
+ set_zone_stat_thresholds(zone, calculate_pressure, 0);
}
}
--
1.7.12.1
Currently vmevent relies on the global page state stats, which are updated
once per stat_interval (1 second by default) or when the per CPU pageset
stats hit their threshold.
We can sum the per CPU vm_stat_diff values asynchronously: the result may
be a bit inconsistent, but overall this should improve accuracy, since with
the previous scheme we would always see the worst case (i.e. we'd wait for
a threshold to be hit or for the 1 second delay), whereas now we get
roughly average accuracy.
The idea is very similar to zone_page_state_snapshot().
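For reference, zone_page_state_snapshot() does roughly the following for a
single zone (slightly simplified copy); the patch below performs the same
kind of summation, but across all populated zones and only when userland
asked for the extra accuracy:
  static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                          enum zone_stat_item item)
  {
          long x = atomic_long_read(&zone->vm_stat[item]);
  #ifdef CONFIG_SMP
          int cpu;
          /* Fold in the per CPU deltas that were not synced yet. */
          for_each_online_cpu(cpu)
                  x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
          if (x < 0)
                  x = 0;
  #endif
          return x;
  }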
Note that this might put more pressure on CPU caches, so we only do it
when userland explicitly asks for accuracy; and since we gather the stats
outside of any fast path, this should be OK in general.
Signed-off-by: Anton Vorontsov <[email protected]>
---
mm/vmevent.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 42 insertions(+), 1 deletion(-)
diff --git a/mm/vmevent.c b/mm/vmevent.c
index 8113bda..a059bed 100644
--- a/mm/vmevent.c
+++ b/mm/vmevent.c
@@ -52,10 +52,51 @@ static u64 vmevent_attr_swap_pages(struct vmevent_watch *watch,
#endif
}
+/*
+ * In the worst case, this is inaccurate by
+ *
+ * ±(pcp->stat_threshold * zones * online_cpus)
+ *
+ * For, say, a 4-core 2GB setup that would be ~350 KB of worst-case
+ * inaccuracy, but to reach it, all CPUs would have to keep
+ * allocating (or freeing) pages from all the zones at the same time, and
+ * all their current vm_stat_diff values would need to be pretty close to
+ * pcp->stat_threshold.
+ *
+ * The larger the system, the more inaccurate vm_stat is (but at the same
+ * time, on large systems we care much less about small chunks of memory).
+ * When more predictable behaviour is needed, userland can set a desired
+ * accuracy via attr->value2.
+ */
+static ulong vmevent_global_page_state(struct vmevent_attr *attr,
+ enum zone_stat_item si)
+{
+ ulong global = global_page_state(si);
+#ifdef CONFIG_SMP
+ struct zone *zone;
+
+ if (!attr->value2)
+ return global;
+
+ for_each_populated_zone(zone) {
+ uint cpu;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pcp;
+
+ pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ global += pcp->vm_stat_diff[si];
+ }
+ }
+#endif
+ return global;
+}
+
static u64 vmevent_attr_free_pages(struct vmevent_watch *watch,
struct vmevent_attr *attr)
{
- return global_page_state(NR_FREE_PAGES);
+ return vmevent_global_page_state(attr, NR_FREE_PAGES);
}
static u64 vmevent_attr_avail_pages(struct vmevent_watch *watch,
--
1.7.12.1
Attributes that use vmstat can now use attr->value2 to specify an optional
accuracy (in bytes). Based on the provided value, we set up appropriate
vmstat thresholds.
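Just to illustrate the intended usage from userland -- a rough sketch only:
the vmevent_config layout is the one from my vmevent tree, only the fields
this patch cares about are filled in, and the syscall is assumed to be
wired up as __NR_vmevent_fd:
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/vmevent.h>
  static int open_free_pages_watch(void)
  {
          struct vmevent_config config = {
                  .counter = 1,
          };
          /* Watch the number of free pages... */
          config.attrs[0].type = VMEVENT_ATTR_NR_FREE_PAGES;
          /*
           * ...and ask for ~64 KB of vmstat accuracy; with 4 KB pages this
           * ends up as a 16-page per CPU, per zone threshold. The rest of
           * the watch setup (attr->value etc.) is omitted in this sketch.
           */
          config.attrs[0].value2 = 64 * 1024;
          return syscall(__NR_vmevent_fd, &config);
  }
The kernel picks the smallest value2 across all watchers, converts it to
pages and uses that as the per CPU vmstat threshold; once the last such
watcher goes away, the normal thresholds are restored.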
Signed-off-by: Anton Vorontsov <[email protected]>
---
include/linux/vmevent.h | 5 +++++
mm/vmevent.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h
index b1c4016..b8c1394 100644
--- a/include/linux/vmevent.h
+++ b/include/linux/vmevent.h
@@ -46,6 +46,11 @@ struct vmevent_attr {
__u64 value;
/*
+ * Some attributes accept two configuration values.
+ */
+ __u64 value2;
+
+ /*
* Type of profiled attribute from VMEVENT_ATTR_XXX
*/
__u32 type;
diff --git a/mm/vmevent.c b/mm/vmevent.c
index 8c0fbe6..8113bda 100644
--- a/mm/vmevent.c
+++ b/mm/vmevent.c
@@ -28,8 +28,13 @@ struct vmevent_watch {
/* poll */
wait_queue_head_t waitq;
+
+ struct list_head node;
};
+static LIST_HEAD(vmevent_watchers);
+static DEFINE_SPINLOCK(vmevent_watchers_lock);
+
typedef u64 (*vmevent_attr_sample_fn)(struct vmevent_watch *watch,
struct vmevent_attr *attr);
@@ -259,12 +264,57 @@ out:
return ret;
}
+#ifdef CONFIG_SMP
+
+static void vmevent_set_thresholds(void)
+{
+ struct vmevent_watch *w;
+ struct zone *zone;
+ u64 thres = ULLONG_MAX;
+
+ spin_lock(&vmevent_watchers_lock);
+
+ list_for_each_entry(w, &vmevent_watchers, node) {
+ int i;
+
+ for (i = 0; i < w->config.counter; i++) {
+ struct vmevent_attr *attr = &w->config.attrs[i];
+
+ if (attr->type != VMEVENT_ATTR_NR_FREE_PAGES)
+ continue;
+ if (!attr->value2)
+ continue;
+ thres = min(thres, attr->value2);
+ }
+ }
+
+ if (thres == ULLONG_MAX)
+ thres = 0;
+
+ thres = (thres + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ for_each_populated_zone(zone)
+ set_zone_stat_thresholds(zone, NULL, thres);
+
+ spin_unlock(&vmevent_watchers_lock);
+}
+
+#else
+static inline void vmevent_set_thresholds(void) {}
+#endif /* CONFIG_SMP */
+
static int vmevent_release(struct inode *inode, struct file *file)
{
struct vmevent_watch *watch = file->private_data;
cancel_delayed_work_sync(&watch->work);
+ spin_lock(&vmevent_watchers_lock);
+ list_del(&watch->node);
+ spin_unlock(&vmevent_watchers_lock);
+
+ vmevent_set_thresholds();
+
kfree(watch);
return 0;
@@ -328,6 +378,10 @@ static int vmevent_setup_watch(struct vmevent_watch *watch)
watch->sample_attrs = attrs;
watch->nr_attrs = nr;
+ spin_lock(&vmevent_watchers_lock);
+ list_add(&watch->node, &vmevent_watchers);
+ spin_unlock(&vmevent_watchers_lock);
+
return 0;
}
@@ -363,6 +417,8 @@ SYSCALL_DEFINE1(vmevent_fd,
if (err)
goto err_free;
+ vmevent_set_thresholds();
+
fd = get_unused_fd_flags(O_RDONLY);
if (fd < 0) {
err = fd;
--
1.7.12.1