Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(). As discussed in 2017 MM summit.
A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017
-JesperBrouer.pdf
This is the second step for optimizing zone statistics, the first patch
introduces a tunable interface that allow VM statistics configurable(see
the first patch for details):
if vmstat_mode = auto, automatic detection of VM statistics
if vmstat_mode = strict, keep all the VM statistics
if vmstat_mode = coarse, ignore unimportant VM statistics
As suggested by Dave Hansen and Ying Huang.
With this interface, the second patch handles numa counters distinctively
according to different vmstat mode, and the test result shows about 4.8%
(185->176) drop of cpu cycles with single thread and 8.1% (343->315) drop
of of cpu cycles with 88 threads for single page allocation.
The third patch updates ABI document accordingly.
Kemi Wang (3):
mm, sysctl: make VM stats configurable
mm: Handle numa statistics distinctively based-on different VM stats
modes
sysctl/vm.txt: Update document
Documentation/sysctl/vm.txt | 26 ++++++++++
drivers/base/node.c | 2 +
include/linux/vmstat.h | 20 +++++++
kernel/sysctl.c | 7 +++
mm/page_alloc.c | 13 +++++
mm/vmstat.c | 124 ++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 192 insertions(+)
--
2.7.4
This patch adds a tunable interface that allows VM stats configurable, as
suggested by Dave Hansen and Ying Huang.
When performance becomes a bottleneck and you can tolerate some possible
tool breakage and some decreased counter precision (e.g. numa counter), you
can do:
echo [C|c]oarse > /proc/sys/vm/vmstat_mode
When performance is not a bottleneck and you want all tooling to work, you
can do:
echo [S|s]trict > /proc/sys/vm/vmstat_mode
We recommend automatic detection of virtual memory statistics by system,
this is also system default configuration, you can do:
echo [A|a]uto > /proc/sys/vm/vmstat_mode
The next patch handles numa statistics distinctively based-on different VM
stats mode.
Reported-by: Jesper Dangaard Brouer <[email protected]>
Suggested-by: Dave Hansen <[email protected]>
Suggested-by: Ying Huang <[email protected]>
Signed-off-by: Kemi Wang <[email protected]>
---
include/linux/vmstat.h | 14 ++++++++++
kernel/sysctl.c | 7 +++++
mm/vmstat.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 91 insertions(+)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..c3634c7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -9,6 +9,20 @@
extern int sysctl_stat_interval;
+/*
+ * vmstat_mode:
+ * 0 = auto mode of vmstat, automatic detection of VM statistics.
+ * 1 = strict mode of vmstat, keep all VM statistics.
+ * 2 = coarse mode of vmstat, ignore unimportant VM statistics.
+ */
+#define VMSTAT_AUTO_MODE 0
+#define VMSTAT_STRICT_MODE 1
+#define VMSTAT_COARSE_MODE 2
+#define VMSTAT_MODE_LEN 16
+extern char sysctl_vmstat_mode[];
+extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos);
+
#ifdef CONFIG_VM_EVENT_COUNTERS
/*
* Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbb..f5b813b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1234,6 +1234,13 @@ static struct ctl_table kern_table[] = {
static struct ctl_table vm_table[] = {
{
+ .procname = "vmstat_mode",
+ .data = &sysctl_vmstat_mode,
+ .maxlen = VMSTAT_MODE_LEN,
+ .mode = 0644,
+ .proc_handler = sysctl_vmstat_mode_handler,
+ },
+ {
.procname = "overcommit_memory",
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e7..e675ad2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,76 @@
#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+int vmstat_mode = VMSTAT_AUTO_MODE;
+char sysctl_vmstat_mode[VMSTAT_MODE_LEN] = "auto";
+static const char *vmstat_mode_name[3] = {"auto", "strict", "coarse"};
+static DEFINE_MUTEX(vmstat_mode_lock);
+
+
+static int __parse_vmstat_mode(char *s)
+{
+ const char *str = s;
+
+ if (strcmp(str, "auto") == 0 || strcmp(str, "Auto") == 0)
+ vmstat_mode = VMSTAT_AUTO_MODE;
+ else if (strcmp(str, "strict") == 0 || strcmp(str, "Strict") == 0)
+ vmstat_mode = VMSTAT_STRICT_MODE;
+ else if (strcmp(str, "coarse") == 0 || strcmp(str, "Coarse") == 0)
+ vmstat_mode = VMSTAT_COARSE_MODE;
+ else {
+ pr_warn("Ignoring invalid vmstat_mode value: %s\n", s);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ char old_string[VMSTAT_MODE_LEN];
+ int ret, oldval;
+
+ mutex_lock(&vmstat_mode_lock);
+ if (write)
+ strncpy(old_string, (char *)table->data, VMSTAT_MODE_LEN);
+ ret = proc_dostring(table, write, buffer, length, ppos);
+ if (ret || !write) {
+ mutex_unlock(&vmstat_mode_lock);
+ return ret;
+ }
+
+ oldval = vmstat_mode;
+ if (__parse_vmstat_mode((char *)table->data)) {
+ /*
+ * invalid sysctl_vmstat_mode value, restore saved string
+ */
+ strncpy((char *)table->data, old_string, VMSTAT_MODE_LEN);
+ vmstat_mode = oldval;
+ } else {
+ /*
+ * check whether vmstat mode changes or not
+ */
+ if (vmstat_mode == oldval) {
+ /* no change */
+ mutex_unlock(&vmstat_mode_lock);
+ return 0;
+ } else if (vmstat_mode == VMSTAT_AUTO_MODE)
+ pr_info("vmstat mode changes from %s to auto mode\n",
+ vmstat_mode_name[oldval]);
+ else if (vmstat_mode == VMSTAT_STRICT_MODE)
+ pr_info("vmstat mode changes from %s to strict mode\n",
+ vmstat_mode_name[oldval]);
+ else if (vmstat_mode == VMSTAT_COARSE_MODE)
+ pr_info("vmstat mode changes from %s to coarse mode\n",
+ vmstat_mode_name[oldval]);
+ else
+ pr_warn("invalid vmstat_mode:%d\n", vmstat_mode);
+ }
+
+ mutex_unlock(&vmstat_mode_lock);
+ return 0;
+}
+
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
--
2.7.4
Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(). As discussed at the 2017 MM Summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.
A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017
-JesperBrouer.pdf
Therefore, with different VM stats mode, numa counters update can operate
differently so that everybody can benefit:
If vmstat_mode = auto, automatic detection of numa statistics, numa counter
update is skipped unless it has been read by users at least once,
e.g. cat /proc/zoneinfo.
If vmstat_mode = strict, numa counter is updated for each page allocation.
If vmstat_mode = coarse, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).
Benchmark link provided by Jesper D Brouer(increase loop times to
10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench
Reported-by: Jesper Dangaard Brouer <[email protected]>
Suggested-by: Dave Hansen <[email protected]>
Suggested-by: Ying Huang <[email protected]>
Signed-off-by: Kemi Wang <[email protected]>
---
drivers/base/node.c | 2 ++
include/linux/vmstat.h | 6 +++++
mm/page_alloc.c | 13 +++++++++++
mm/vmstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 78 insertions(+), 3 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 3855902..033c0c3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
static ssize_t node_read_numastat(struct device *dev,
struct device_attribute *attr, char *buf)
{
+ disable_zone_statistics = false;
return sprintf(buf,
"numa_hit %lu\n"
"numa_miss %lu\n"
@@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev,
NR_VM_NUMA_STAT_ITEMS],
node_page_state(pgdat, i));
+ disable_zone_statistics = false;
return n;
}
static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c3634c7..ca9854c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -9,6 +9,7 @@
extern int sysctl_stat_interval;
+extern bool disable_zone_statistics;
/*
* vmstat_mode:
* 0 = auto mode of vmstat, automatic detection of VM statistics.
@@ -19,6 +20,7 @@ extern int sysctl_stat_interval;
#define VMSTAT_STRICT_MODE 1
#define VMSTAT_COARSE_MODE 2
#define VMSTAT_MODE_LEN 16
+extern int vmstat_mode;
extern char sysctl_vmstat_mode[];
extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
@@ -243,6 +245,10 @@ extern unsigned long sum_zone_node_page_state(int node,
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
+extern void zero_zone_numa_counters(struct zone *zone);
+extern void zero_zones_numa_counters(void);
+extern void zero_global_numa_counters(void);
+extern void invalid_numa_statistics(void);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af8..010a620 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,6 +83,8 @@ DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
+bool disable_zone_statistics = true;
+
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
+ /*
+ * skip zone_statistics() if vmstat is a coarse mode or zone statistics
+ * is inactive in auto vmstat mode
+ */
+
+ if (vmstat_mode) {
+ if (vmstat_mode == VMSTAT_COARSE_MODE)
+ return;
+ } else if (disable_zone_statistics)
+ return;
+
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e675ad2..bcaef62 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -85,15 +85,31 @@ int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
/* no change */
mutex_unlock(&vmstat_mode_lock);
return 0;
- } else if (vmstat_mode == VMSTAT_AUTO_MODE)
+ } else if (vmstat_mode == VMSTAT_AUTO_MODE) {
pr_info("vmstat mode changes from %s to auto mode\n",
vmstat_mode_name[oldval]);
- else if (vmstat_mode == VMSTAT_STRICT_MODE)
+ /*
+ * Set default numa stats action when vmstat mode changes
+ * from coarse to auto
+ */
+ if (oldval == VMSTAT_COARSE_MODE)
+ disable_zone_statistics = true;
+ } else if (vmstat_mode == VMSTAT_STRICT_MODE)
pr_info("vmstat mode changes from %s to strict mode\n",
vmstat_mode_name[oldval]);
- else if (vmstat_mode == VMSTAT_COARSE_MODE)
+ else if (vmstat_mode == VMSTAT_COARSE_MODE) {
pr_info("vmstat mode changes from %s to coarse mode\n",
vmstat_mode_name[oldval]);
+#ifdef CONFIG_NUMA
+ /*
+ * Invalidate numa counters when vmstat mode is set to coarse
+ * mode, because users can't tell the difference between the
+ * dead state and when allocator activity is quiet once
+ * zone_statistics() is turned off.
+ */
+ invalid_numa_statistics();
+#endif
+ }
else
pr_warn("invalid vmstat_mode:%d\n", vmstat_mode);
}
@@ -984,6 +1000,42 @@ unsigned long sum_zone_numa_state(int node,
return count;
}
+/* zero numa counters within a zone */
+void zero_zone_numa_counters(struct zone *zone)
+{
+ int item, cpu;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_stat[item], 0);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] = 0;
+ }
+}
+
+/* zero numa counters of all the populated zones */
+void zero_zones_numa_counters(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zero_zone_numa_counters(zone);
+}
+
+/* zero global numa counters */
+void zero_global_numa_counters(void)
+{
+ int item;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+ atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+void invalid_numa_statistics(void)
+{
+ zero_zones_numa_counters();
+ zero_global_numa_counters();
+}
+
/*
* Determine the per node value of a stat item.
*/
@@ -1652,6 +1704,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
+ disable_zone_statistics = false;
return 0;
}
@@ -1748,6 +1801,7 @@ static int vmstat_show(struct seq_file *m, void *arg)
static void vmstat_stop(struct seq_file *m, void *arg)
{
+ disable_zone_statistics = false;
kfree(m->private);
m->private = NULL;
}
--
2.7.4
Add a paragraph to introduce the functionality and usage on vmstat_mode in
sysctl/vm.txt
Reported-by: Jesper Dangaard Brouer <[email protected]>
Suggested-by: Dave Hansen <[email protected]>
Suggested-by: Ying Huang <[email protected]>
Signed-off-by: Kemi Wang <[email protected]>
---
Documentation/sysctl/vm.txt | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..6ab2843 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
- swappiness
- user_reserve_kbytes
- vfs_cache_pressure
+- vmstat_mode
- watermark_scale_factor
- zone_reclaim_mode
@@ -843,6 +844,31 @@ ten times more freeable objects than there are.
=============================================================
+vmstat_mode
+
+This interface allows virtual memory statistics configurable.
+
+When performance becomes a bottleneck and you can tolerate some possible
+tool breakage and some decreased counter precision (e.g. numa counter), you
+can do:
+ echo [C|c]oarse > /proc/sys/vm/vmstat_mode
+ignorable statistics list:
+- numa counters
+
+When performance is not a bottleneck and you want all tooling to work, you
+can do:
+ echo [S|s]trict > /proc/sys/vm/vmstat_mode
+
+We recommend automatic detection of virtual memory statistics by system,
+this is also system default configuration, you can do:
+ echo [A|a]uto > /proc/sys/vm/vmstat_mode
+
+E.g. numa statistics does not affect system's decision and it is very
+rarely consumed. If set vmstat_mode = auto, numa counters update is skipped
+unless the counter is *read* by users at least once.
+
+==============================================================
+
watermark_scale_factor:
This factor controls the aggressiveness of kswapd. It defines the
--
2.7.4
On Fri 15-09-17 17:23:24, Kemi Wang wrote:
> This patch adds a tunable interface that allows VM stats configurable, as
> suggested by Dave Hansen and Ying Huang.
>
> When performance becomes a bottleneck and you can tolerate some possible
> tool breakage and some decreased counter precision (e.g. numa counter), you
> can do:
> echo [C|c]oarse > /proc/sys/vm/vmstat_mode
>
> When performance is not a bottleneck and you want all tooling to work, you
> can do:
> echo [S|s]trict > /proc/sys/vm/vmstat_mode
>
> We recommend automatic detection of virtual memory statistics by system,
> this is also system default configuration, you can do:
> echo [A|a]uto > /proc/sys/vm/vmstat_mode
>
> The next patch handles numa statistics distinctively based-on different VM
> stats mode.
I would just merge this with the second patch so that it is clear how
those modes are implemented. I am also wondering why cannot we have a
much simpler interface and implementation to enable/disable numa stats
(btw. sysctl_vm_numa_stats would be more descriptive IMHO).
Why do we need an auto-mode? Is it safe to enforce by default. Is it
possible that userspace can get confused to see 0 NUMA stats in the
first read while other allocation stats are non-zero?
> Reported-by: Jesper Dangaard Brouer <[email protected]>
> Suggested-by: Dave Hansen <[email protected]>
> Suggested-by: Ying Huang <[email protected]>
> Signed-off-by: Kemi Wang <[email protected]>
> ---
> include/linux/vmstat.h | 14 ++++++++++
> kernel/sysctl.c | 7 +++++
> mm/vmstat.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 91 insertions(+)
>
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index ade7cb5..c3634c7 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -9,6 +9,20 @@
>
> extern int sysctl_stat_interval;
>
> +/*
> + * vmstat_mode:
> + * 0 = auto mode of vmstat, automatic detection of VM statistics.
> + * 1 = strict mode of vmstat, keep all VM statistics.
> + * 2 = coarse mode of vmstat, ignore unimportant VM statistics.
> + */
> +#define VMSTAT_AUTO_MODE 0
> +#define VMSTAT_STRICT_MODE 1
> +#define VMSTAT_COARSE_MODE 2
> +#define VMSTAT_MODE_LEN 16
> +extern char sysctl_vmstat_mode[];
> +extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *length, loff_t *ppos);
> +
> #ifdef CONFIG_VM_EVENT_COUNTERS
> /*
> * Light weight per cpu counter implementation.
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 6648fbb..f5b813b 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1234,6 +1234,13 @@ static struct ctl_table kern_table[] = {
>
> static struct ctl_table vm_table[] = {
> {
> + .procname = "vmstat_mode",
> + .data = &sysctl_vmstat_mode,
> + .maxlen = VMSTAT_MODE_LEN,
> + .mode = 0644,
> + .proc_handler = sysctl_vmstat_mode_handler,
> + },
> + {
> .procname = "overcommit_memory",
> .data = &sysctl_overcommit_memory,
> .maxlen = sizeof(sysctl_overcommit_memory),
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4bb13e7..e675ad2 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -32,6 +32,76 @@
>
> #define NUMA_STATS_THRESHOLD (U16_MAX - 2)
>
> +int vmstat_mode = VMSTAT_AUTO_MODE;
> +char sysctl_vmstat_mode[VMSTAT_MODE_LEN] = "auto";
> +static const char *vmstat_mode_name[3] = {"auto", "strict", "coarse"};
> +static DEFINE_MUTEX(vmstat_mode_lock);
> +
> +
> +static int __parse_vmstat_mode(char *s)
> +{
> + const char *str = s;
> +
> + if (strcmp(str, "auto") == 0 || strcmp(str, "Auto") == 0)
> + vmstat_mode = VMSTAT_AUTO_MODE;
> + else if (strcmp(str, "strict") == 0 || strcmp(str, "Strict") == 0)
> + vmstat_mode = VMSTAT_STRICT_MODE;
> + else if (strcmp(str, "coarse") == 0 || strcmp(str, "Coarse") == 0)
> + vmstat_mode = VMSTAT_COARSE_MODE;
> + else {
> + pr_warn("Ignoring invalid vmstat_mode value: %s\n", s);
> + return -EINVAL;
> + }
> + return 0;
> +}
> +
> +int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *length, loff_t *ppos)
> +{
> + char old_string[VMSTAT_MODE_LEN];
> + int ret, oldval;
> +
> + mutex_lock(&vmstat_mode_lock);
> + if (write)
> + strncpy(old_string, (char *)table->data, VMSTAT_MODE_LEN);
> + ret = proc_dostring(table, write, buffer, length, ppos);
> + if (ret || !write) {
> + mutex_unlock(&vmstat_mode_lock);
> + return ret;
> + }
> +
> + oldval = vmstat_mode;
> + if (__parse_vmstat_mode((char *)table->data)) {
> + /*
> + * invalid sysctl_vmstat_mode value, restore saved string
> + */
> + strncpy((char *)table->data, old_string, VMSTAT_MODE_LEN);
> + vmstat_mode = oldval;
> + } else {
> + /*
> + * check whether vmstat mode changes or not
> + */
> + if (vmstat_mode == oldval) {
> + /* no change */
> + mutex_unlock(&vmstat_mode_lock);
> + return 0;
> + } else if (vmstat_mode == VMSTAT_AUTO_MODE)
> + pr_info("vmstat mode changes from %s to auto mode\n",
> + vmstat_mode_name[oldval]);
> + else if (vmstat_mode == VMSTAT_STRICT_MODE)
> + pr_info("vmstat mode changes from %s to strict mode\n",
> + vmstat_mode_name[oldval]);
> + else if (vmstat_mode == VMSTAT_COARSE_MODE)
> + pr_info("vmstat mode changes from %s to coarse mode\n",
> + vmstat_mode_name[oldval]);
> + else
> + pr_warn("invalid vmstat_mode:%d\n", vmstat_mode);
> + }
> +
> + mutex_unlock(&vmstat_mode_lock);
> + return 0;
> +}
> +
> #ifdef CONFIG_VM_EVENT_COUNTERS
> DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
> EXPORT_PER_CPU_SYMBOL(vm_event_states);
> --
> 2.7.4
>
--
Michal Hocko
SUSE Labs
On Fri 15-09-17 17:23:25, Kemi Wang wrote:
[...]
> @@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
> #ifdef CONFIG_NUMA
> enum numa_stat_item local_stat = NUMA_LOCAL;
>
> + /*
> + * skip zone_statistics() if vmstat is a coarse mode or zone statistics
> + * is inactive in auto vmstat mode
> + */
> +
> + if (vmstat_mode) {
> + if (vmstat_mode == VMSTAT_COARSE_MODE)
> + return;
> + } else if (disable_zone_statistics)
> + return;
> +
> if (z->node != numa_node_id())
> local_stat = NUMA_OTHER;
A jump label could make this completely out of the way for the case
where every single cycle matters.
--
Michal Hocko
SUSE Labs
On 09/15/2017 04:49 AM, Michal Hocko wrote:
> Why do we need an auto-mode? Is it safe to enforce by default.
Do we *need* it? Not really.
But, it does offer the best of both worlds: The vast majority of users
see virtually no impact from the counters. The minority that do need
them pay the cost *and* don't have to change their tooling at all.
> Is it> possible that userspace can get confused to see 0 NUMA stats in
the
> first read while other allocation stats are non-zero?
I doubt it. Those counters are pretty worthless by themselves. I have
tooling that goes and reads them, but it aways displays deltas. Read
stats, sleep one second, read again, print the difference.
The only scenario I can see mattering is someone who is seeing a
performance issue due to NUMA allocation misses (or whatever) and wants
to go look *back* in the past.
A single-time printk could also go a long way to keeping folks from
getting confused.
On Fri 15-09-17 07:16:23, Dave Hansen wrote:
> On 09/15/2017 04:49 AM, Michal Hocko wrote:
> > Why do we need an auto-mode? Is it safe to enforce by default.
>
> Do we *need* it? Not really.
>
> But, it does offer the best of both worlds: The vast majority of users
> see virtually no impact from the counters. The minority that do need
> them pay the cost *and* don't have to change their tooling at all.
Just to make it clear, I am not really opposing. It just adds some code
which we can safe... It is also rather chatty for something that can be
true/false.
> > Is it> possible that userspace can get confused to see 0 NUMA stats in
> the
> > first read while other allocation stats are non-zero?
>
> I doubt it. Those counters are pretty worthless by themselves. I have
> tooling that goes and reads them, but it aways displays deltas. Read
> stats, sleep one second, read again, print the difference.
This is how I use them as well.
> The only scenario I can see mattering is someone who is seeing a
> performance issue due to NUMA allocation misses (or whatever) and wants
> to go look *back* in the past.
yes
> A single-time printk could also go a long way to keeping folks from
> getting confused.
--
Michal Hocko
SUSE Labs
-----Original Message-----
From: Michal Hocko [mailto:[email protected]]
Sent: Friday, September 15, 2017 7:50 PM
To: Wang, Kemi <[email protected]>
Cc: Luis R . Rodriguez <[email protected]>; Kees Cook <[email protected]>; Andrew Morton <[email protected]>; Jonathan Corbet <[email protected]>; Mel Gorman <[email protected]>; Johannes Weiner <[email protected]>; Christopher Lameter <[email protected]>; Sebastian Andrzej Siewior <[email protected]>; Vlastimil Babka <[email protected]>; Hillf Danton <[email protected]>; Dave <[email protected]>; Chen, Tim C <[email protected]>; Kleen, Andi <[email protected]>; Jesper Dangaard Brouer <[email protected]>; Huang, Ying <[email protected]>; Lu, Aaron <[email protected]>; Proc sysctl <[email protected]>; Linux MM <[email protected]>; Linux Kernel <[email protected]>
Subject: Re: [PATCH 1/3] mm, sysctl: make VM stats configurable
On Fri 15-09-17 17:23:24, Kemi Wang wrote:
> This patch adds a tunable interface that allows VM stats configurable, as
> suggested by Dave Hansen and Ying Huang.
>
> When performance becomes a bottleneck and you can tolerate some possible
> tool breakage and some decreased counter precision (e.g. numa counter), you
> can do:
> echo [C|c]oarse > /proc/sys/vm/vmstat_mode
>
> When performance is not a bottleneck and you want all tooling to work, you
> can do:
> echo [S|s]trict > /proc/sys/vm/vmstat_mode
>
> We recommend automatic detection of virtual memory statistics by system,
> this is also system default configuration, you can do:
> echo [A|a]uto > /proc/sys/vm/vmstat_mode
>
> The next patch handles numa statistics distinctively based-on different VM
> stats mode.
I would just merge this with the second patch so that it is clear how
those modes are implemented. I am also wondering why cannot we have a
much simpler interface and implementation to enable/disable numa stats
(btw. sysctl_vm_numa_stats would be more descriptive IMHO).
The motivation is that we propose a general tunable interface for VM stats.
This would be more scalable, since we don't have to add an individual
Interface for each type of counter that can be configurable.
In the second patch, NUMA stats, as an example, can benefit for that.
If you still hold your idea, I don't mind to merge them together.
--
Michal Hocko
SUSE Labs
On 2017年09月15日 22:28, Michal Hocko wrote:
> On Fri 15-09-17 07:16:23, Dave Hansen wrote:
>> On 09/15/2017 04:49 AM, Michal Hocko wrote:
>>> Why do we need an auto-mode? Is it safe to enforce by default.
>>
>> Do we *need* it? Not really.
>>
>> But, it does offer the best of both worlds: The vast majority of users
>> see virtually no impact from the counters. The minority that do need
>> them pay the cost *and* don't have to change their tooling at all.
>
> Just to make it clear, I am not really opposing. It just adds some code
> which we can safe... It is also rather chatty for something that can be
> true/false.
>
It has benefit, as Dave mentioned above.
Actually, it adds some coding complexity to provide a tuning interface with
on/off/auto mode. Using human-readable string instead of magic number makes
it easier to use, people probably don't need to review the ABI doc again
before using it. So, I don't think that should be a problem
>>> Is it> possible that userspace can get confused to see 0 NUMA stats in
>> the
>>> first read while other allocation stats are non-zero?
>>
>> I doubt it. Those counters are pretty worthless by themselves. I have
>> tooling that goes and reads them, but it aways displays deltas. Read
>> stats, sleep one second, read again, print the difference.
>
> This is how I use them as well.
>
>> The only scenario I can see mattering is someone who is seeing a
>> performance issue due to NUMA allocation misses (or whatever) and wants
>> to go look *back* in the past.
>
> yes
>
If it really matters, setting vmstat_mode=strict as a default option is a simple
way to fix it. What's your idea? thanks
>> A single-time printk could also go a long way to keeping folks from
>> getting confused.
>
On 2017年09月15日 19:50, Michal Hocko wrote:
> On Fri 15-09-17 17:23:25, Kemi Wang wrote:
> [...]
>> @@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
>> #ifdef CONFIG_NUMA
>> enum numa_stat_item local_stat = NUMA_LOCAL;
>>
>> + /*
>> + * skip zone_statistics() if vmstat is a coarse mode or zone statistics
>> + * is inactive in auto vmstat mode
>> + */
>> +
>> + if (vmstat_mode) {
>> + if (vmstat_mode == VMSTAT_COARSE_MODE)
>> + return;
>> + } else if (disable_zone_statistics)
>> + return;
>> +
>> if (z->node != numa_node_id())
>> local_stat = NUMA_OTHER;
>
> A jump label could make this completely out of the way for the case
> where every single cycle matters.
>
Could you be more explicit for how to implement it here. Thanks very much.
On 2017年09月15日 19:49, Michal Hocko wrote:
> On Fri 15-09-17 17:23:24, Kemi Wang wrote:
>> This patch adds a tunable interface that allows VM stats configurable, as
>> suggested by Dave Hansen and Ying Huang.
>>
>> When performance becomes a bottleneck and you can tolerate some possible
>> tool breakage and some decreased counter precision (e.g. numa counter), you
>> can do:
>> echo [C|c]oarse > /proc/sys/vm/vmstat_mode
>>
>> When performance is not a bottleneck and you want all tooling to work, you
>> can do:
>> echo [S|s]trict > /proc/sys/vm/vmstat_mode
>>
>> We recommend automatic detection of virtual memory statistics by system,
>> this is also system default configuration, you can do:
>> echo [A|a]uto > /proc/sys/vm/vmstat_mode
>>
>> The next patch handles numa statistics distinctively based-on different VM
>> stats mode.
>
> I would just merge this with the second patch so that it is clear how
> those modes are implemented. I am also wondering why cannot we have a
> much simpler interface and implementation to enable/disable numa stats
> (btw. sysctl_vm_numa_stats would be more descriptive IMHO).
>
Apologize for resending it, because I found my previous reply mixed with
Michal's in many email client.
The motivation is that we propose a general tunable interface for VM stats.
This would be more scalable, since we don't have to add an individual
Interface for each type of counter that can be configurable.
In the second patch, NUMA stats, as an example, can benefit for that.
If you still hold your idea, I don't mind to merge them together.
On 09/17/2017 08:07 PM, kemi wrote:
>>> + if (vmstat_mode) {
>>> + if (vmstat_mode == VMSTAT_COARSE_MODE)
>>> + return;
>>> + } else if (disable_zone_statistics)
>>> + return;
>>> +
>>> if (z->node != numa_node_id())
>>> local_stat = NUMA_OTHER;
>>
>> A jump label could make this completely out of the way for the case
>> where every single cycle matters.
>
> Could you be more explicit for how to implement it here. Thanks very much.
Take a look at include/linux/jump_label.h.
On 2017年09月18日 12:13, Dave Hansen wrote:
> On 09/17/2017 08:07 PM, kemi wrote:
>>>> + if (vmstat_mode) {
>>>> + if (vmstat_mode == VMSTAT_COARSE_MODE)
>>>> + return;
>>>> + } else if (disable_zone_statistics)
>>>> + return;
>>>> +
>>>> if (z->node != numa_node_id())
>>>> local_stat = NUMA_OTHER;
>>>
>>> A jump label could make this completely out of the way for the case
>>> where every single cycle matters.
>>
>> Could you be more explicit for how to implement it here. Thanks very much.
>
> Take a look at include/linux/jump_label.h.
>
>
Sure, Thanks
>
On Mon 18-09-17 11:22:37, kemi wrote:
>
>
> On 2017年09月15日 19:49, Michal Hocko wrote:
> > On Fri 15-09-17 17:23:24, Kemi Wang wrote:
> >> This patch adds a tunable interface that allows VM stats configurable, as
> >> suggested by Dave Hansen and Ying Huang.
> >>
> >> When performance becomes a bottleneck and you can tolerate some possible
> >> tool breakage and some decreased counter precision (e.g. numa counter), you
> >> can do:
> >> echo [C|c]oarse > /proc/sys/vm/vmstat_mode
> >>
> >> When performance is not a bottleneck and you want all tooling to work, you
> >> can do:
> >> echo [S|s]trict > /proc/sys/vm/vmstat_mode
> >>
> >> We recommend automatic detection of virtual memory statistics by system,
> >> this is also system default configuration, you can do:
> >> echo [A|a]uto > /proc/sys/vm/vmstat_mode
> >>
> >> The next patch handles numa statistics distinctively based-on different VM
> >> stats mode.
> >
> > I would just merge this with the second patch so that it is clear how
> > those modes are implemented. I am also wondering why cannot we have a
> > much simpler interface and implementation to enable/disable numa stats
> > (btw. sysctl_vm_numa_stats would be more descriptive IMHO).
> >
>
> Apologize for resending it, because I found my previous reply mixed with
> Michal's in many email client.
>
> The motivation is that we propose a general tunable interface for VM stats.
> This would be more scalable, since we don't have to add an individual
> Interface for each type of counter that can be configurable.
Can you envision which other counters would fall into the same category?
> In the second patch, NUMA stats, as an example, can benefit for that.
> If you still hold your idea, I don't mind to merge them together.
Well, I would prefer simplicy in the first place.
--
Michal Hocko
SUSE Labs
On Mon 18-09-17 10:44:52, kemi wrote:
>
>
> On 2017年09月15日 22:28, Michal Hocko wrote:
> > On Fri 15-09-17 07:16:23, Dave Hansen wrote:
> >> On 09/15/2017 04:49 AM, Michal Hocko wrote:
> >>> Why do we need an auto-mode? Is it safe to enforce by default.
> >>
> >> Do we *need* it? Not really.
> >>
> >> But, it does offer the best of both worlds: The vast majority of users
> >> see virtually no impact from the counters. The minority that do need
> >> them pay the cost *and* don't have to change their tooling at all.
> >
> > Just to make it clear, I am not really opposing. It just adds some code
> > which we can safe... It is also rather chatty for something that can be
> > true/false.
> >
>
> It has benefit, as Dave mentioned above.
> Actually, it adds some coding complexity to provide a tuning interface with
> on/off/auto mode. Using human-readable string instead of magic number makes
> it easier to use, people probably don't need to review the ABI doc again
> before using it. So, I don't think that should be a problem
Is this a thing that would be changed very often. I suspect that once
needed it will be set in a startup sysctl configuration and there will
be no further need to touch it again.
> >>> Is it> possible that userspace can get confused to see 0 NUMA stats in
> >> the
> >>> first read while other allocation stats are non-zero?
> >>
> >> I doubt it. Those counters are pretty worthless by themselves. I have
> >> tooling that goes and reads them, but it aways displays deltas. Read
> >> stats, sleep one second, read again, print the difference.
> >
> > This is how I use them as well.
> >
> >> The only scenario I can see mattering is someone who is seeing a
> >> performance issue due to NUMA allocation misses (or whatever) and wants
> >> to go look *back* in the past.
> >
> > yes
> >
>
> If it really matters, setting vmstat_mode=strict as a default option is a simple
> way to fix it. What's your idea? thanks
Well, we are usually very conservative when changing the default
behavior. The primary reason why I was asking is that the auto mode
doesn't make much sense unless it is the default. I fully realize that
such an hypothetical breakage is really hard to envision but considering
it is more code to allow auto mode than a simple on/off (we have parsing
helpers for that AFAIR) then I would rather go with the simpler option.
This is up to you of course.
--
Michal Hocko
SUSE Labs