2017-04-25 14:00:36

by Marcelo Tosatti

Subject: [patch 2/2] MM: allow per-cpu vmstat_threshold and vmstat_worker configuration

The per-CPU vmstat worker is a problem on -RT workloads (because
ideally the CPU is entirely reserved for the -RT app, without
interference). The worker transfers accumulated per-CPU
vmstat counters to global counters.

To resolve the problem, create two tunables:

* Userspace configurable per-CPU vmstat threshold: by default the
VM code calculates the size of the per-CPU vmstat arrays. This
tunable allows userspace to configure the values.

* Userspace configurable per-CPU vmstat worker: allow disabling
the per-CPU vmstat worker.

The patch below contains documentation which describes the tunables
in more detail.
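As an illustration only (not part of the patch): assuming the sysfs layout described in the documentation below, a CPU-isolation setup tool could program the two knobs for a given CPU from C roughly as follows; the helper and its error handling are just a sketch.

/*
 * Illustration only, not part of the patch: program the proposed
 * per-CPU knobs for one CPU from userspace.  Paths follow the
 * documentation added below; error handling kept minimal.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_cpu_knob(int cpu, const char *knob, int val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/vmstat/%s", cpu, knob);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", val);
        return fclose(f);
}

int main(int argc, char **argv)
{
        int cpu = argc > 1 ? atoi(argv[1]) : 1;

        /* keep the per-CPU counters nearly exact, then stop the worker */
        if (write_cpu_knob(cpu, "vmstat_threshold", 1) ||
            write_cpu_knob(cpu, "vmstat_worker", 0))
                return 1;
        return 0;
}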

Signed-off-by: Marcelo Tosatti <[email protected]>

---
Documentation/vm/vmstat_thresholds.txt | 38 +++++
mm/vmstat.c | 248 +++++++++++++++++++++++++++++++--
2 files changed, 272 insertions(+), 14 deletions(-)

Index: linux-2.6-git-disable-vmstat-worker/mm/vmstat.c
===================================================================
--- linux-2.6-git-disable-vmstat-worker.orig/mm/vmstat.c 2017-04-25 07:39:13.941019853 -0300
+++ linux-2.6-git-disable-vmstat-worker/mm/vmstat.c 2017-04-25 10:44:51.581977296 -0300
@@ -91,8 +91,17 @@
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

+struct vmstat_uparam {
+ atomic_t vmstat_work_enabled;
+ atomic_t user_stat_thresh;
+};
+
+static DEFINE_PER_CPU(struct vmstat_uparam, vmstat_uparam);
+
#ifdef CONFIG_SMP

+#define MAX_THRESHOLD 125
+
int calculate_pressure_threshold(struct zone *zone)
{
int threshold;
@@ -110,9 +119,9 @@
threshold = max(1, (int)(watermark_distance / num_online_cpus()));

/*
- * Maximum threshold is 125
+ * Maximum threshold is MAX_THRESHOLD == 125
*/
- threshold = min(125, threshold);
+ threshold = min(MAX_THRESHOLD, threshold);

return threshold;
}
@@ -188,15 +197,31 @@
threshold = calculate_normal_threshold(zone);

for_each_online_cpu(cpu) {
- int pgdat_threshold;
+ int pgdat_threshold, ustat_thresh;
+ struct vmstat_uparam *vup;

- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
+ struct per_cpu_nodestat __percpu *pcp;
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ vup = &per_cpu(vmstat_uparam, cpu);
+ ustat_thresh = atomic_read(&vup->user_stat_thresh);
+
+ if (ustat_thresh)
+ p->stat_threshold = ustat_thresh;
+ else
+ p->stat_threshold = threshold;
+
+ pcp = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

/* Base nodestat threshold on the largest populated zone. */
- pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
- per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
- = max(threshold, pgdat_threshold);
+ pgdat_threshold = pcp->stat_threshold;
+ if (ustat_thresh)
+ pcp->stat_threshold = ustat_thresh;
+ else
+ pcp->stat_threshold = max(threshold,
+ pgdat_threshold);
}

/*
@@ -226,9 +251,24 @@
continue;

threshold = (*calculate_pressure)(zone);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ int t, ustat_thresh;
+ struct vmstat_uparam *vup;
+
+ vup = &per_cpu(vmstat_uparam, cpu);
+ ustat_thresh = atomic_read(&vup->user_stat_thresh);
+ t = threshold;
+
+ /*
+ * min because pressure could cause
+ * calculate_pressure'ed value to be smaller.
+ */
+ if (ustat_thresh)
+ t = min(threshold, ustat_thresh);
+
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
+ = t;
+ }
}
}

@@ -1567,6 +1607,9 @@
long val;
int err;
int i;
+ int cpu;
+ struct work_struct __percpu *works;
+ static struct cpumask has_work;

/*
* The regular update, every sysctl_stat_interval, may come later
@@ -1580,9 +1623,31 @@
* transiently negative values, report an error here if any of
* the stats is negative, so we know to go looking for imbalance.
*/
- err = schedule_on_each_cpu(refresh_vm_stats);
- if (err)
- return err;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works)
+ return -ENOMEM;
+
+ cpumask_clear(&has_work);
+ get_online_cpus();
+
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+ struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+ if (atomic_read(&vup->vmstat_work_enabled)) {
+ INIT_WORK(work, refresh_vm_stats);
+ schedule_work_on(cpu, work);
+ cpumask_set_cpu(cpu, &has_work);
+ }
+ }
+
+ for_each_cpu(cpu, &has_work)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ put_online_cpus();
+ free_percpu(works);
+
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
@@ -1674,6 +1739,10 @@
/* Check processors whose vmstat worker threads have been disabled */
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+ if (atomic_read(&vup->vmstat_work_enabled) == 0)
+ continue;

if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
@@ -1696,6 +1765,135 @@
round_jiffies_relative(sysctl_stat_interval));
}

+#ifdef CONFIG_SYSFS
+
+static ssize_t vmstat_worker_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ unsigned int cpu = dev->id;
+ struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+ return sprintf(buf, "%d\n", atomic_read(&vup->vmstat_work_enabled));
+}
+
+static ssize_t vmstat_worker_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, val;
+ struct vmstat_uparam *vup;
+ unsigned int cpu = dev->id;
+
+ ret = sscanf(buf, "%d", &val);
+ if (ret != 1 || val > 1 || val < 0)
+ return -EINVAL;
+
+ preempt_disable();
+
+ if (cpu_online(cpu)) {
+ vup = &per_cpu(vmstat_uparam, cpu);
+ atomic_set(&vup->vmstat_work_enabled, val);
+ } else
+ count = -EINVAL;
+
+ preempt_enable();
+
+ return count;
+}
+
+static ssize_t vmstat_thresh_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ struct vmstat_uparam *vup;
+ unsigned int cpu = dev->id;
+
+ preempt_disable();
+
+ vup = &per_cpu(vmstat_uparam, cpu);
+ ret = sprintf(buf, "%d\n", atomic_read(&vup->user_stat_thresh));
+
+ preempt_enable();
+
+ return ret;
+}
+
+static ssize_t vmstat_thresh_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret, val;
+ unsigned int cpu = dev->id;
+ struct vmstat_uparam *vup;
+
+ ret = sscanf(buf, "%d", &val);
+ if (ret != 1 || val < 1 || val > MAX_THRESHOLD)
+ return -EINVAL;
+
+ preempt_disable();
+
+ if (cpu_online(cpu)) {
+ vup = &per_cpu(vmstat_uparam, cpu);
+ atomic_set(&vup->user_stat_thresh, val);
+ } else
+ count = -EINVAL;
+
+ preempt_enable();
+
+ return count;
+}
+
+struct device_attribute vmstat_worker_attr =
+ __ATTR(vmstat_worker, 0644, vmstat_worker_show, vmstat_worker_store);
+
+struct device_attribute vmstat_threshold_attr =
+ __ATTR(vmstat_threshold, 0644, vmstat_thresh_show, vmstat_thresh_store);
+
+static struct attribute *vmstat_attrs[] = {
+ &vmstat_worker_attr.attr,
+ &vmstat_threshold_attr.attr,
+ NULL
+};
+
+static struct attribute_group vmstat_attr_group = {
+ .attrs = vmstat_attrs,
+ .name = "vmstat"
+};
+
+static int vmstat_thresh_cpu_online(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+ int ret;
+
+ ret = sysfs_create_group(&dev->kobj, &vmstat_attr_group);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int vmstat_thresh_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ sysfs_remove_group(&dev->kobj, &vmstat_attr_group);
+ return 0;
+}
+
+static void init_vmstat_sysfs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+ atomic_set(&vup->user_stat_thresh, 0);
+ atomic_set(&vup->vmstat_work_enabled, 1);
+ }
+}
+
+#endif /* CONFIG_SYSFS */
+
static void __init init_cpu_node_state(void)
{
int node;
@@ -1723,9 +1921,13 @@
{
const struct cpumask *node_cpus;
int node;
+ struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);

node = cpu_to_node(cpu);

+ atomic_set(&vup->user_stat_thresh, 0);
+ atomic_set(&vup->vmstat_work_enabled, 1);
+
refresh_zone_stat_thresholds();
node_cpus = cpumask_of_node(node);
if (cpumask_weight(node_cpus) > 0)
@@ -1735,7 +1937,7 @@
return 0;
}

-#endif
+#endif /* CONFIG_SMP */

struct workqueue_struct *mm_percpu_wq;

@@ -1772,6 +1974,24 @@
#endif
}

+static int __init init_mm_internals_late(void)
+{
+#ifdef CONFIG_SYSFS
+ int ret;
+
+ init_vmstat_sysfs();
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/vmstat_thresh:online",
+ vmstat_thresh_cpu_online,
+ vmstat_thresh_cpu_down_prep);
+ if (ret < 0)
+ pr_err("vmstat_thresh: failed to register 'online' hotplug state\n");
+#endif
+ return 0;
+}
+
+late_initcall(init_mm_internals_late);
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)

/*
Index: linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt 2017-04-25 08:46:25.237395070 -0300
@@ -0,0 +1,38 @@
+Userspace configurable vmstat thresholds
+========================================
+
+This document describes the tunables to control
+per-CPU vmstat threshold and per-CPU vmstat worker
+thread.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_threshold:
+
+This file contains the per-CPU vmstat threshold.
+This value is the maximum that a single per-CPU vmstat statistic
+can accumulate before it is transferred to the global counters.
+
+A value of 0 indicates that the threshold is set
+by the in-kernel algorithm.
+
+A value different from 0 indicates that this particular
+value is used as the vmstat threshold.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_worker:
+
+Enable/disable the per-CPU vmstat worker.
+
+Usage example:
+==============
+
+To disable the vmstat_update worker for cpu1:
+
+# cd /sys/devices/system/cpu/cpu1/vmstat/
+
+# echo 1 > vmstat_threshold
+# echo 0 > vmstat_worker
+
+Setting vmstat_threshold to 1 means the per-CPU
+vmstat statistics will not be out-of-date
+for CPU 1.
+
+



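For readers not familiar with how stat_threshold is consumed, here is a simplified sketch (not the kernel's actual code) of the fold decision made when a per-CPU counter is modified. It shows why a small threshold keeps each per-CPU differential close to the global value, at the cost of touching the global counters much more often:

/*
 * Simplified sketch, not the kernel's implementation: a per-CPU delta is
 * only folded into the global counter once its magnitude exceeds
 * stat_threshold, so small thresholds mean accurate-but-expensive global
 * counters and large thresholds mean cheap-but-lagging ones.
 */
struct pcp_counter {
        long diff;              /* per-CPU accumulated delta */
        int stat_threshold;     /* fold into the global counter beyond this */
};

static void mod_state(struct pcp_counter *pcp, long *global, long delta)
{
        long x = pcp->diff + delta;

        if (x > pcp->stat_threshold || x < -pcp->stat_threshold) {
                *global += x;   /* fold everything accumulated so far */
                x = 0;
        }
        pcp->diff = x;
}

With vmstat_threshold set to 1 the fold happens on almost every update, which is why the documentation above can promise nearly up-to-date statistics for that CPU, and also why disabling the worker only makes sense together with a small threshold.
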
2017-04-25 19:29:23

by Rik van Riel

Subject: Re: [patch 2/2] MM: allow per-cpu vmstat_threshold and vmstat_worker configuration

On Tue, 2017-04-25 at 10:57 -0300, Marcelo Tosatti wrote:
> The per-CPU vmstat worker is a problem on -RT workloads (because
> ideally the CPU is entirely reserved for the -RT app, without
> interference). The worker transfers accumulated per-CPU 
> vmstat counters to global counters.
>
> To resolve the problem, create two tunables:
>
> * Userspace configurable per-CPU vmstat threshold: by default the 
> VM code calculates the size of the per-CPU vmstat arrays. This 
> tunable allows userspace to configure the values.
>
> * Userspace configurable per-CPU vmstat worker: allow disabling
> the per-CPU vmstat worker.
>
> The patch below contains documentation which describes the tunables
> in more detail.

The documentation says what the tunables do, but
not how you should set them in different scenarios,
or why.

That could be a little more helpful to sysadmins.

2017-04-25 19:36:50

by Marcelo Tosatti

Subject: Re: [patch 2/2] MM: allow per-cpu vmstat_threshold and vmstat_worker configuration

On Tue, Apr 25, 2017 at 03:29:06PM -0400, Rik van Riel wrote:
> On Tue, 2017-04-25 at 10:57 -0300, Marcelo Tosatti wrote:
> > The per-CPU vmstat worker is a problem on -RT workloads (because
> > ideally the CPU is entirely reserved for the -RT app, without
> > interference). The worker transfers accumulated per-CPU?
> > vmstat counters to global counters.
> >
> > To resolve the problem, create two tunables:
> >
> > * Userspace configurable per-CPU vmstat threshold: by default the
> > VM code calculates the size of the per-CPU vmstat arrays. This
> > tunable allows userspace to configure the values.
> >
> > * Userspace configurable per-CPU vmstat worker: allow disabling
> > the per-CPU vmstat worker.
> >
> > The patch below contains documentation which describes the tunables
> > in more detail.
>
> The documentation says what the tunables do, but
> not how you should set them in different scenarios,
> or why.
>
> That could be a little more helpful to sysadmins.

OK, I'll update the document to be more verbose.

2017-07-11 15:21:27

by Marcelo Tosatti

Subject: Re: [patch 2/2] MM: allow per-cpu vmstat_threshold and vmstat_worker configuration

On Tue, May 30, 2017 at 01:17:41PM -0500, Christoph Lameter wrote:
> On Fri, 26 May 2017, Marcelo Tosatti wrote:
>
> > > interrupts and scheduler ticks. But what does this have to do with vmstat?
> > >
> > > Show me your dpdk code running and trace the tick on / off events as well
> > > as the vmstat invocations. Also show all system calls occurring on the cpu
> > > that runs dpdk. That is necessary to see what triggers vmstat and how the
> > > system reacts to the changes to the differentials.

This was in the host, while performing virtual machine migration... which you could
say "invalidates the argument", because virtual machine migration takes
MUCH longer than the latency vmstat_update introduces.

> >
> > Sure, i can get that to you. The question remains: Are you arguing
> > its not valid for a realtime application to use any system call
> > which changes a vmstat counter?
>
> A true realtime app would be conscientious of its use of the OS services
> because the use of the services may cause additional latencies and also
> cause timers etc to fire later. A realtime app that is willing to use
> these services is therefore willing to tolerate larger latencies. A
> realtime app that is using OS service may cause the timer tick to be
> enabled which also causes additional latencies.
>
> I have seen completely OS noise free processing for extended time period
> when not using OS services and using RDMA for I/O. This fits my use case
> well.

People might want to use OS services.

> If there are really these high latencies because of kworker processing for
> vmstat then maybe we need a different mechanism there (bh? or other
> triggers) and maybe we are using far too many counters so that the
> processing becomes a heavy user of resources.
>
> > Because if they are allowed, then its obvious something like
> > this is needed.
>
> I am still wondering what benefit there is. Lets get clear on the test
> load and see if this actually makes sense.

Ok, test load:

* Any userspace app that makes a system call which triggers
vmstat_update is susceptible to vmstat_update running on that
CPU, which might be detrimental to latency.

So something that either moves the vmstat_update work to another CPU,
or avoids vmstat_update entirely (which is what the proposed patchset
does), is necessary.
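To make the scenario concrete, here is a minimal sketch (hypothetical, not a benchmark posted in this thread) of such a workload: a task pinned to an isolated CPU whose only system calls are mmap/munmap, which is already enough to dirty per-CPU vmstat counters and have vmstat_update queued on that CPU.

/*
 * Hypothetical example, not code from this thread: pin to an isolated
 * CPU and only do mmap/fault/munmap.  The page fault and the unmap both
 * modify per-CPU vmstat counters, so vmstat_update can end up queued on
 * this CPU even though the task "only" made memory-management syscalls.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        cpu_set_t set;
        long page = sysconf(_SC_PAGESIZE);

        CPU_ZERO(&set);
        CPU_SET(1, &set);       /* assume CPU 1 is the isolated/RT CPU */
        if (sched_setaffinity(0, sizeof(set), &set))
                return 1;

        for (;;) {
                char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (p == MAP_FAILED)
                        return 1;
                p[0] = 1;       /* fault the page in: bumps vmstat counters */
                munmap(p, page);
        }
}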

So if a customer comes to me and says: "I am using sys_XXX in my
application, but my latency is high", I'll have to tell him: "OK, please
don't use that system call, since it triggers kernel activity on the CPU
which does not allow you to achieve the latency you desire".

But it seems the "no syscalls" rule is a good idea for
CPU-isolated, low-latency workloads...

So I give up on the use case behind this patch.