Subject: RFC vmstat: On demand vmstat threads


vmstat threads are used for folding counter differentials into the
zone, per node and global counters at certain time intervals.

They currently run at defined intervals on all processors which will
cause some holdoff for processors that need minimal intrusion by the
OS.

This patch creates a vmstat shepherd task that monitors the
per cpu differentials on all processors. If there are differentials
on a processor then a vmstat thread local to the processors with
the differentials is created. That process will then start
folding the diffs in regular intervals. Should the vmstat
process find that there is no work to be done then it will
terminate itself and make the shepherd task monitor the differentials
again.

Note: This patch is based on the vmstat patches in Andrew's tree
to be merged for the 3.12 kernel.

Also some of the logic is inspired by Gilad's earlier work.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux/mm/vmstat.c
===================================================================
--- linux.orig/mm/vmstat.c 2013-09-04 10:09:59.158419700 -0500
+++ linux/mm/vmstat.c 2013-09-04 10:49:55.114407202 -0500
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -414,13 +415,18 @@ void dec_zone_page_state(struct page *pa
EXPORT_SYMBOL(dec_zone_page_state);
#endif

-static inline void fold_diff(int *diff)
+
+static inline int fold_diff(int *diff)
{
int i;
+ int changes = 0;

for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (diff[i])
+ if (diff[i]) {
atomic_long_add(diff[i], &vm_stat[i]);
+ changes++;
+ }
+ return changes;
}

/*
@@ -437,11 +443,12 @@ static inline void fold_diff(int *diff)
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*/
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int changes = 0;

for_each_populated_zone(zone) {
struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -485,11 +492,41 @@ static void refresh_cpu_vm_stats(void)
if (__this_cpu_dec_return(p->expire))
continue;

- if (__this_cpu_read(p->pcp.count))
+ if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
+ changes++;
+ }
#endif
}
- fold_diff(global_diff);
+ changes += fold_diff(global_diff);
+ return changes;
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static int need_update(int cpu)
+{
+ struct zone *zone;
+ int i;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset __percpu *p = zone->pageset;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (__this_cpu_read(p->vm_stat_diff[i]))
+ return 1;
+#ifdef CONFIG_NUMA
+ /*
+ * Check if there are pages remaining in this pageset
+ */
+ if (__this_cpu_read(p->pcp.count))
+ return 1;
+
+#endif
+ }
+ return 0;
}

/*
@@ -1203,12 +1240,15 @@ static const struct file_operations proc
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static struct cpumask *monitored_cpus;

static void vmstat_update(struct work_struct *w)
{
- refresh_cpu_vm_stats();
- schedule_delayed_work(this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (refresh_cpu_vm_stats())
+ schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else
+ cpumask_set_cpu(smp_processor_id(), monitored_cpus);
}

static void start_cpu_timer(int cpu)
@@ -1216,9 +1256,41 @@ static void start_cpu_timer(int cpu)
struct delayed_work *work = &per_cpu(vmstat_work, cpu);

INIT_DEFERRABLE_WORK(work, vmstat_update);
- schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+ schedule_delayed_work_on(cpu, work,
+ __round_jiffies_relative(sysctl_stat_interval, cpu));
+}
+
+static struct delayed_work shepherd_work;
+extern int tick_do_timer_cpu;
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+ int cpu;
+
+ refresh_cpu_vm_stats();
+ for_each_cpu(cpu, monitored_cpus)
+ if (need_update(cpu)) {
+ cpumask_clear_cpu(cpu, monitored_cpus);
+ start_cpu_timer(cpu);
+ }
+
+ schedule_delayed_work_on(tick_do_timer_cpu,
+ &shepherd_work,
+ __round_jiffies_relative(sysctl_stat_interval,
+ tick_do_timer_cpu));
}

+
+static void start_shepherd_timer(void)
+{
+ INIT_DEFERRABLE_WORK(&shepherd_work, vmstat_shepherd);
+ monitored_cpus = kmalloc(BITS_TO_LONGS(nr_cpu_ids) * sizeof(long), __GFP_NOFAIL);
+ cpumask_copy(monitored_cpus, cpu_online_mask);
+ cpumask_clear_cpu(tick_do_timer_cpu, monitored_cpus);
+ schedule_delayed_work_on(tick_do_timer_cpu,
+ &shepherd_work, __round_jiffies_relative(sysctl_stat_interval, tick_do_timer_cpu));
+
+}
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
@@ -1233,17 +1305,19 @@ static int vmstat_cpuup_callback(struct
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
+ cpumask_set_cpu(cpu, monitored_cpus);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ if (!cpumask_test_cpu(cpu, monitored_cpus))
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ cpumask_clear_cpu(cpu, monitored_cpus);
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ cpumask_set_cpu(cpu, monitored_cpus);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1262,12 +1336,8 @@ static struct notifier_block vmstat_noti
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
- int cpu;
-
register_cpu_notifier(&vmstat_notifier);
-
- for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);


2013-09-10 06:15:27

by Gilad Ben-Yossef

Subject: Re: RFC vmstat: On demand vmstat threads

On Wed, Sep 4, 2013 at 7:48 PM, Christoph Lameter <[email protected]> wrote:
>
> vmstat threads are used for folding counter differentials into the
> zone, per node and global counters at certain time intervals.
>
> They currently run at defined intervals on all processors which will
> cause some holdoff for processors that need minimal intrusion by the
> OS.
>
> This patch creates a vmstat shepherd task that monitors the
> per cpu differentials on all processors. If there are differentials
> on a processor then a vmstat thread local to the processors with
> the differentials is created. That process will then start
> folding the diffs in regular intervals. Should the vmstat
> process find that there is no work to be done then it will
> terminate itself and make the shepherd task monitor the differentials
> again.
>

I wasn't happy with the results of my own attempt to accomplish the same and I
like this much better. So, for what it's worth -

Reviewed-by: Gilad Ben-Yossef <[email protected]>

Thanks,
Gilad


--
Gilad Ben-Yossef
Chief Coffee Drinker
[email protected]
Israel Cell: +972-52-8260388
US Cell: +1-973-8260388
http://benyossef.com

"If you take a class in large-scale robotics, can you end up in a
situation where the homework eats your dog?"
-- Jean-Baptiste Queru

Subject: Re: RFC vmstat: On demand vmstat threads

On Tue, 10 Sep 2013, Gilad Ben-Yossef wrote:

> I wasn't happy with the results of my own attempt to accomplish the same and I
> like this much better. So, for what it's worth -

Thanks. In the meantime I found some issues with my patchset and revised
it further. Could you have another look?


Subject: vmstat: On demand vmstat workers V2

vmstat threads are used for folding counter differentials into the
zone, per node and global counters at certain time intervals.

They currently run at defined intervals on all processors which will
cause some holdoff for processors that need minimal intrusion by the
OS.

This patch creates a vmstat shepherd task that monitors the
per cpu differentials on all processors. If there are differentials
on a processor then a vmstat worker thread local to the processors
with the differentials is created. That worker will then start
folding the diffs in regular intervals. Should the worker
find that there is no work to be done then it will
terminate itself and make the shepherd task monitor the differentials
again.

With this patch it is possible then to have periods longer than
2 seconds without any OS event on a "cpu" (hardware thread).

The tick_do_timer_cpu is chosen to run the shepherd workers.
So there must be at least one cpu that will keep running vmstat
updates.

Note: This patch is based on the vmstat patches in Andrew's tree
to be merged for the 3.12 kernel.

V1->V2:
- Optimize the need_update check by using memchr_inv.
- Clean up.
- Fixup the wrong need_update check.
- Drop the check for pcp.count. Too many false positives.

Reviewed-by: Gilad Ben-Yossef <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>


Index: linux/mm/vmstat.c
===================================================================
--- linux.orig/mm/vmstat.c 2013-09-09 13:58:25.526562233 -0500
+++ linux/mm/vmstat.c 2013-09-09 16:09:14.266402841 -0500
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -414,13 +415,18 @@ void dec_zone_page_state(struct page *pa
EXPORT_SYMBOL(dec_zone_page_state);
#endif

-static inline void fold_diff(int *diff)
+
+static inline int fold_diff(int *diff)
{
int i;
+ int changes = 0;

for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (diff[i])
+ if (diff[i]) {
atomic_long_add(diff[i], &vm_stat[i]);
+ changes++;
+ }
+ return changes;
}

/*
@@ -437,11 +443,12 @@ static inline void fold_diff(int *diff)
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*/
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int changes = 0;

for_each_populated_zone(zone) {
struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -485,11 +492,14 @@ static void refresh_cpu_vm_stats(void)
if (__this_cpu_dec_return(p->expire))
continue;

- if (__this_cpu_read(p->pcp.count))
+ if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
+ changes++;
+ }
#endif
}
- fold_diff(global_diff);
+ changes += fold_diff(global_diff);
+ return changes;
}

/*
@@ -1203,12 +1213,15 @@ static const struct file_operations proc
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static struct cpumask *monitored_cpus;

static void vmstat_update(struct work_struct *w)
{
- refresh_cpu_vm_stats();
- schedule_delayed_work(this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (refresh_cpu_vm_stats())
+ schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else
+ cpumask_set_cpu(smp_processor_id(), monitored_cpus);
}

static void start_cpu_timer(int cpu)
@@ -1216,7 +1229,63 @@ static void start_cpu_timer(int cpu)
struct delayed_work *work = &per_cpu(vmstat_work, cpu);

INIT_DEFERRABLE_WORK(work, vmstat_update);
- schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+ schedule_delayed_work_on(cpu, work,
+ __round_jiffies_relative(sysctl_stat_interval, cpu));
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static int need_update(int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+ /*
+ * The fast way of checking if there are any vmstat diffs.
+ * This works because the diffs are byte sized items.
+ */
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ return 1;
+ }
+ return 0;
+}
+
+static struct delayed_work shepherd_work;
+extern int tick_do_timer_cpu;
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+ int cpu;
+
+ refresh_cpu_vm_stats();
+ for_each_cpu(cpu, monitored_cpus)
+ if (need_update(cpu)) {
+ cpumask_clear_cpu(cpu, monitored_cpus);
+ start_cpu_timer(cpu);
+ }
+
+ schedule_delayed_work_on(tick_do_timer_cpu,
+ &shepherd_work,
+ __round_jiffies_relative(sysctl_stat_interval,
+ tick_do_timer_cpu));
+}
+
+static void start_shepherd_timer(void)
+{
+ INIT_DEFERRABLE_WORK(&shepherd_work, vmstat_shepherd);
+ monitored_cpus = kmalloc(BITS_TO_LONGS(nr_cpu_ids) * sizeof(long),
+ __GFP_NOFAIL);
+ cpumask_copy(monitored_cpus, cpu_online_mask);
+ cpumask_clear_cpu(tick_do_timer_cpu, monitored_cpus);
+ schedule_delayed_work_on(tick_do_timer_cpu,
+ &shepherd_work,
+ __round_jiffies_relative(sysctl_stat_interval,
+ tick_do_timer_cpu));
+
}

/*
@@ -1233,17 +1302,19 @@ static int vmstat_cpuup_callback(struct
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
+ cpumask_set_cpu(cpu, monitored_cpus);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ if (!cpumask_test_cpu(cpu, monitored_cpus))
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ cpumask_clear_cpu(cpu, monitored_cpus);
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ cpumask_set_cpu(cpu, monitored_cpus);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1262,12 +1333,8 @@ static struct notifier_block vmstat_noti
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
- int cpu;
-
register_cpu_notifier(&vmstat_notifier);
-
- for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
+ start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);

2013-09-18 22:07:04

by Andrew Morton

Subject: Re: RFC vmstat: On demand vmstat threads

On Tue, 10 Sep 2013 21:13:34 +0000 Christoph Lameter <[email protected]> wrote:

> Subject: vmstat: On demand vmstat workers V2

grumbles.

> vmstat threads are used for folding counter differentials into the
> zone, per node and global counters at certain time intervals.

These are not "threads". Let's please use accurate terminology
("keventd works" is close enough) and not inappropriately repurpose
well-understood terms.

> They currently run at defined intervals on all processors which will
> cause some holdoff for processors that need minimal intrusion by the
> OS.
>
> This patch creates a vmstat shepherd task that monitors the

No, it does not call kthread_run() hence it does not create a task. Or
a thread.

> per cpu differentials on all processors. If there are differentials
> on a processor then a vmstat worker thread local to the processors
> with the differentials is created. That worker will then start
> folding the diffs in regular intervals. Should the worker
> find that there is no work to be done then it will
> terminate itself and make the shepherd task monitor the differentials
> again.
>
> With this patch it is possible then to have periods longer than
> 2 seconds without any OS event on a "cpu" (hardware thread).

It would be useful (actually essential) to have a description of why
anyone cares about this. A good and detailed description, please.

> The tick_do_timer_cpu is chosen to run the shepherd workers.
> So there must be at least one cpu that will keep running vmstat
> updates.
>
> ...
>
> --- linux.orig/mm/vmstat.c 2013-09-09 13:58:25.526562233 -0500
> +++ linux/mm/vmstat.c 2013-09-09 16:09:14.266402841 -0500
> @@ -14,6 +14,7 @@
> #include <linux/module.h>
> #include <linux/slab.h>
> #include <linux/cpu.h>
> +#include <linux/cpumask.h>
> #include <linux/vmstat.h>
> #include <linux/sched.h>
> #include <linux/math64.h>
> @@ -414,13 +415,18 @@ void dec_zone_page_state(struct page *pa
> EXPORT_SYMBOL(dec_zone_page_state);
> #endif
>
> -static inline void fold_diff(int *diff)
> +
> +static inline int fold_diff(int *diff)
> {
> int i;
> + int changes = 0;
>
> for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> - if (diff[i])
> + if (diff[i]) {
> atomic_long_add(diff[i], &vm_stat[i]);
> + changes++;
> + }
> + return changes;
> }
>
> /*
> @@ -437,11 +443,12 @@ static inline void fold_diff(int *diff)
> * with the global counters. These could cause remote node cache line
> * bouncing and will have to be only done when necessary.

Document the newly-added return value? Especially as it's a scalar
which is used as a boolean when it isn't just ignored.

> */
> -static void refresh_cpu_vm_stats(void)
> +static int refresh_cpu_vm_stats(void)
> {
> struct zone *zone;
> int i;
> int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
> + int changes = 0;
>
> for_each_populated_zone(zone) {
> struct per_cpu_pageset __percpu *p = zone->pageset;
> @@ -485,11 +492,14 @@ static void refresh_cpu_vm_stats(void)
> if (__this_cpu_dec_return(p->expire))
> continue;
>
> - if (__this_cpu_read(p->pcp.count))
> + if (__this_cpu_read(p->pcp.count)) {
> drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
> + changes++;
> + }
> #endif
> }
> - fold_diff(global_diff);
> + changes += fold_diff(global_diff);
> + return changes;
> }
>
> /*
> @@ -1203,12 +1213,15 @@ static const struct file_operations proc
> #ifdef CONFIG_SMP
> static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> int sysctl_stat_interval __read_mostly = HZ;
> +static struct cpumask *monitored_cpus;
>
> static void vmstat_update(struct work_struct *w)
> {
> - refresh_cpu_vm_stats();
> - schedule_delayed_work(this_cpu_ptr(&vmstat_work),
> - round_jiffies_relative(sysctl_stat_interval));
> + if (refresh_cpu_vm_stats())
> + schedule_delayed_work(this_cpu_ptr(&vmstat_work),
> + round_jiffies_relative(sysctl_stat_interval));
> + else
> + cpumask_set_cpu(smp_processor_id(), monitored_cpus);
> }
>
> static void start_cpu_timer(int cpu)
> @@ -1216,7 +1229,63 @@ static void start_cpu_timer(int cpu)
> struct delayed_work *work = &per_cpu(vmstat_work, cpu);
>
> INIT_DEFERRABLE_WORK(work, vmstat_update);
> - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
> + schedule_delayed_work_on(cpu, work,
> + __round_jiffies_relative(sysctl_stat_interval, cpu));
> +}
> +
> +/*
> + * Check if the diffs for a certain cpu indicate that
> + * an update is needed.
> + */
> +static int need_update(int cpu)
> +{
> + struct zone *zone;
> +
> + for_each_populated_zone(zone) {
> + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
> +
> + /*
> + * The fast way of checking if there are any vmstat diffs.
> + * This works because the diffs are byte sized items.
> + */
> + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
> + return 1;
> + }
> + return 0;
> +}
> +
> +static struct delayed_work shepherd_work;
> +extern int tick_do_timer_cpu;

This should be in a header file so we can keep definition and users in
sync.

> +static void vmstat_shepherd(struct work_struct *w)
> +{
> + int cpu;
> +
> + refresh_cpu_vm_stats();
> + for_each_cpu(cpu, monitored_cpus)
> + if (need_update(cpu)) {
> + cpumask_clear_cpu(cpu, monitored_cpus);
> + start_cpu_timer(cpu);
> + }
> +
> + schedule_delayed_work_on(tick_do_timer_cpu,
> + &shepherd_work,
> + __round_jiffies_relative(sysctl_stat_interval,
> + tick_do_timer_cpu));
> +}

Some documentation would be nice. The unobvious things. ie: design
concepts.

> +static void start_shepherd_timer(void)

Should be __init

> +{
> + INIT_DEFERRABLE_WORK(&shepherd_work, vmstat_shepherd);

It should be possible to do this at compile time. Might need addition
of core infrastructure.
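
For instance (untested sketch; this assumes the existing
DECLARE_DEFERRABLE_WORK() initializer from workqueue.h is usable here,
given a forward declaration of vmstat_shepherd()):

    /* compile-time initialization instead of INIT_DEFERRABLE_WORK() */
    static DECLARE_DEFERRABLE_WORK(shepherd_work, vmstat_shepherd);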

> + monitored_cpus = kmalloc(BITS_TO_LONGS(nr_cpu_ids) * sizeof(long),
> + __GFP_NOFAIL);

Please don't add new uses of __GFP_NOFAIL.

Using __GFP_NOFAIL without __GFP_WAIT, __GFP_FS etc is irrational and
I'm not sure what the page allocator will do. It might just go into a
non-reclaiming tight loop, as that's precisely what this usage asked it
to do.

Let's just use GFP_KERNEL and handle any failure (which shouldn't
happen at boot time anyway).
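
Something like this, perhaps (a sketch only; alloc_cpumask_var() would
be the idiomatic helper and also copes with CONFIG_CPUMASK_OFFSTACK):

    static cpumask_var_t monitored_cpus;

    if (!alloc_cpumask_var(&monitored_cpus, GFP_KERNEL)) {
            WARN_ON_ONCE(1);        /* boot-time allocation, should not fail */
            return;
    }
    cpumask_copy(monitored_cpus, cpu_online_mask);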

> + cpumask_copy(monitored_cpus, cpu_online_mask);
> + cpumask_clear_cpu(tick_do_timer_cpu, monitored_cpus);

What on earth are we using tick_do_timer_cpu for anyway?
tick_do_timer_cpu is cheerfully undocumented, as is this code's use of
it.

> ...

2013-09-18 22:57:44

by Thomas Gleixner

Subject: Re: RFC vmstat: On demand vmstat threads

On Wed, 18 Sep 2013, Andrew Morton wrote:
> On Tue, 10 Sep 2013 21:13:34 +0000 Christoph Lameter <[email protected]> wrote:
> > + cpumask_copy(monitored_cpus, cpu_online_mask);
> > + cpumask_clear_cpu(tick_do_timer_cpu, monitored_cpus);
>
> What on earth are we using tick_do_timer_cpu for anyway?
> tick_do_timer_cpu is cheerfully undocumented, as is this code's use of
> it.

tick_do_timer_cpu is a timer core internal variable, which holds the
CPU NR which is responsible for calling do_timer(), i.e. the
timekeeping stuff. This variable has two functions:

1) Prevent a thundering herd issue of a gazillion of CPUs trying to
grab the timekeeping lock all at once. Only the CPU which is
assigned to do the update is handling it.

2) Hand off the duty in the NOHZ idle case by setting the value to
TICK_DO_TIMER_NONE, i.e. a non-existing CPU. So the next cpu which
looks at it will take over and keep the timekeeping alive.
The handover procedure also covers cpu hotplug.
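
Roughly, the timer core side looks like this (a sketch of
kernel/time/tick-sched.c, not the verbatim code):

    static void tick_sched_do_timer(ktime_t now)
    {
            int cpu = smp_processor_id();

            /* 2) NOHZ handoff: the duty cpu went idle and set
             * TICK_DO_TIMER_NONE; the next cpu that gets here takes over */
            if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                    tick_do_timer_cpu = cpu;

            /* 1) only the assigned cpu updates jiffies, so there is no
             * thundering herd on the timekeeping lock */
            if (tick_do_timer_cpu == cpu)
                    tick_do_update_jiffies64(now);
    }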

(Ab)Using it for anything else outside the timers core code is just
broken.

It's working for Christoph's use case as his setup will not change the
assignment away from the boot cpu, but that's really not a brilliant
design to start with.

The vmstat accounting is not the only thing which we want to delegate
to dedicated core(s) for the full NOHZ mode.

So instead of playing broken games with explicitly not exposed core
code variables, we should implement a core code facility which is
aware of the NOHZ details and provides a sane way to delegate stuff to
a certain subset of CPUs.

Thanks,

tglx

2013-09-19 16:55:10

by Gilad Ben-Yossef

Subject: Re: RFC vmstat: On demand vmstat threads

On Wed, Sep 18, 2013 at 5:06 PM, Andrew Morton
<[email protected]> wrote:
> On Tue, 10 Sep 2013 21:13:34 +0000 Christoph Lameter <[email protected]> wrote:
>

>> With this patch it is possible then to have periods longer than
>> 2 seconds without any OS event on a "cpu" (hardware thread).
>
> It would be useful (actually essential) to have a description of why
> anyone cares about this. A good and detailed description, please.


Let me have a stab at this:

The existing vmstat_update mechanism depends on a deferrable timer
firing every second by default, which registers a work queue item that
runs on the local CPU, with the result that we have one interrupt and
one additional schedulable task on each CPU approximately every second.

If your workload indeed causes VM activity or you are running multiple
tasks per CPU, you probably have bigger issues to deal with.

However, many existing workloads dedicate a CPU to a single CPU-bound
task. This is done by high performance computing folks, by high
frequency financial applications folks, and by networking folks (Intel
DPDK, EZchip NPS). With the advent of systems with more and more CPUs
over time, this will(?) become more and more common to do, since when
you have enough CPUs you care less about efficiently sharing your CPU
with other tasks and more about efficiently monopolizing a CPU per
task.

The difference made by having this timer firing and the workqueue
kernel thread scheduled per second can be enormous. An artificial test
I made, measuring the worst case time to do a simple "i++" in an
endless loop on a bare metal system and under Linux on an isolated CPU
(cpusets or isolcpus - take your pick) with dynticks, with and without
this patch, has Linux match the bare metal performance (~700 cycles)
with this patch and lose by a couple of orders of magnitude (~200k
cycles) without it[*] - and all this for something that just
calculates statistics. For networking applications, for example, this
is the difference between dropping packets and sustaining line rate.
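
A loop of this kind is what the test measures (a sketch of the idea,
not the actual test code; assumes x86 and gcc's __rdtsc()):

    #include <stdint.h>
    #include <x86intrin.h>      /* __rdtsc() */

    int main(void)
    {
            volatile uint64_t i = 0;
            uint64_t t0, t1, worst = 0;

            for (;;) {
                    t0 = __rdtsc();
                    i++;                    /* the "work" */
                    t1 = __rdtsc();
                    if (t1 - t0 > worst)    /* any OS intrusion shows up here */
                            worst = t1 - t0;
            }
    }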

Statistics are important and useful, but it would be great if there
were a way to keep statistics gathering from producing such a huge
performance difference. This is what we are trying to do here.

Does it make sense?

[*] To be honest it required one more patch, but this one or something
like it is needed to get that one working, so...

Thanks,
Gilad






Subject: Re: RFC vmstat: On demand vmstat threads

On Thu, 19 Sep 2013, Thomas Gleixner wrote:

> The vmstat accounting is not the only thing which we want to delegate
> to dedicated core(s) for the full NOHZ mode.
>
> So instead of playing broken games with explicitly not exposed core
> code variables, we should implement a core code facility which is
> aware of the NOHZ details and provides a sane way to delegate stuff to
> a certain subset of CPUs.

I would be happy to use such a facility. Otherwise I would just be adding
yet another kernel option or boot parameter I guess.

2013-09-19 21:42:49

by Gilad Ben-Yossef

Subject: Re: RFC vmstat: On demand vmstat threads

On Tue, Sep 10, 2013 at 4:13 PM, Christoph Lameter <[email protected]> wrote:

>
> Note: This patch is based on the vmstat patches in Andrew's tree
> to be merged for the 3.12 kernel.

Sorry for being dumb but this patch doesn't apply for me on either
mmotm nor Linus master. What did I miss?

Gilad



2013-09-20 10:41:21

by Thomas Gleixner

Subject: Re: RFC vmstat: On demand vmstat threads

On Thu, 19 Sep 2013, Christoph Lameter wrote:
> On Thu, 19 Sep 2013, Thomas Gleixner wrote:
>
> > The vmstat accounting is not the only thing which we want to delegate
> > to dedicated core(s) for the full NOHZ mode.
> >
> > So instead of playing broken games with explicitly not exposed core
> > code variables, we should implement a core code facility which is
> > aware of the NOHZ details and provides a sane way to delegate stuff to
> > a certain subset of CPUs.
>
> I would be happy to use such a facility. Otherwise I would just be adding
> yet another kernel option or boot parameter I guess.

Uuurgh, no.

The whole delegation stuff is necessary not just for vmstat. We have
the same issue for scheduler stats and other parts of the kernel, so
we are better off in having a core facility to schedule such functions
in consistency with the current full NOHZ state.

Thanks,

tglx


2013-09-20 16:42:09

by Frederic Weisbecker

Subject: Re: RFC vmstat: On demand vmstat threads

On Fri, Sep 20, 2013 at 12:41:02PM +0200, Thomas Gleixner wrote:
> On Thu, 19 Sep 2013, Christoph Lameter wrote:
> > On Thu, 19 Sep 2013, Thomas Gleixner wrote:
> >
> > > The vmstat accounting is not the only thing which we want to delegate
> > > to dedicated core(s) for the full NOHZ mode.
> > >
> > > So instead of playing broken games with explicitly not exposed core
> > > code variables, we should implement a core code facility which is
> > > aware of the NOHZ details and provides a sane way to delegate stuff to
> > > a certain subset of CPUs.
> >
> > I would be happy to use such a facility. Otherwise I would just be adding
> > yet another kernel option or boot parameter I guess.
>
> Uuurgh, no.
>
> The whole delegation stuff is necessary not just for vmstat. We have
> the same issue for scheduler stats and other parts of the kernel, so
> we are better off in having a core facility to schedule such functions
> in consistency with the current full NOHZ state.

Agreed.

So we have the choice between having this performed from callers in the
kernel, with functions that enforce the affinity of some asynchronous
tasks - like "schedule_on_timekeeper()" or "schedule_on_housekeepers()"
with workqueues, for example.

Or we can add an interface to define the affinity of such things from
userspace, at the risk of exposing some kernel details like workqueue
or timer internal callback names.

Oh, and maybe this must stay flexible enough to handle dispatched
housekeeping in the future. Like on big NUMA machines that want to
dispatch some part of the housekeeping on each NUMA node for the
nearby running CPUs. Although I don't have any detail in mind for that.

I've also been thinking of some flag for deferrable timers to be also
user-deferrable. But I expect too much overhead to maintain that on
kernel/user boundaries. And eventually the issues we have go beyond
just user/kernel ring conditions.

Just random thoughts.

2013-09-20 21:03:24

by Thomas Gleixner

Subject: Re: RFC vmstat: On demand vmstat threads

On Fri, 20 Sep 2013, Frederic Weisbecker wrote:
> On Fri, Sep 20, 2013 at 12:41:02PM +0200, Thomas Gleixner wrote:
> > On Thu, 19 Sep 2013, Christoph Lameter wrote:
> > > On Thu, 19 Sep 2013, Thomas Gleixner wrote:
> > >
> > > > The vmstat accounting is not the only thing which we want to delegate
> > > > to dedicated core(s) for the full NOHZ mode.
> > > >
> > > > So instead of playing broken games with explicitly not exposed core
> > > > code variables, we should implement a core code facility which is
> > > > aware of the NOHZ details and provides a sane way to delegate stuff to
> > > > a certain subset of CPUs.
> > >
> > > I would be happy to use such a facility. Otherwise I would just be adding
> > > yet another kernel option or boot parameter I guess.
> >
> > Uuurgh, no.
> >
> > The whole delegation stuff is necessary not just for vmstat. We have
> > the same issue for scheduler stats and other parts of the kernel, so
> > we are better off in having a core facility to schedule such functions
> > in consistency with the current full NOHZ state.
>
> Agreed.
>
> So we have the choice between having this performed from callers in
> the kernel with functions that enforce the affinity of some
> asynchronous tasks, like "schedule_on_timekeeper()" or
> "schedule_on_housekeeers()" with workqueues for example.

Why do you need different targets?

> Or we can add interface to define the affinity of such things from
> userspace, at the ....

We already have the relevant information in the kernel. And it's not
too hard to come up with a rather simple and robust scheme for this.

For the following I use the terms enter/leave isolation mode in that
way:

Enter/leave isolation mode is when the full NOHZ mode is
enabled/disabled for a cpu, not when the CPU actually
enters/leaves that state (i.e. single cpu bound userspace task).

So what you want is something like this:

int housekeeping_register(int (*cb)(struct cpumask *mask),
                          unsigned period_ms, bool always);

cb: the callback to execute. it processes the data for all cores
which are set in the cpumask handed in by the housekeeping
scheduler.

period_ms: period of the callback, can be 0 for immediate
one time execution

always: the always argument tells the core code whether to schedule
the callback unconditionally. If false it only schedules it
when the core enters isolation mode.

In the beginning we simply schedule the callbacks on each online cpu,
if the always bit is set. For the callbacks which are registered with
the always bit off, we schedule them only on entry into isolation
mode.
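
A sketch of how vmstat could sit on top of this (the callback signature
is taken from the proposal above; refresh_cpu_vm_stats_cpu() is a
hypothetical helper that folds the diffs of a given cpu):

    static int vmstat_housekeep(struct cpumask *mask)
    {
            int cpu;

            /* process the vmstat diffs for all cpus we are assigned */
            for_each_cpu(cpu, mask)
                    if (need_update(cpu))
                            refresh_cpu_vm_stats_cpu(cpu);
            return 0;
    }

    static int __init setup_vmstat_housekeeping(void)
    {
            /* always == true: vmstat folding is wanted on all online cpus */
            return housekeeping_register(vmstat_housekeep, 1000, true);
    }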

Now when a cpu becomes isolated we stop the callback scheduling on
that cpu and assign it to the cpu with the smallest NUMA
distance. So that cpu will process the data for itself and for the
newly isolated cpu.

When a cpu leaves isolation mode then it gets its housekeeping task
assigned back.

We need to be clever about the NOHZ idle interaction. If a cpu has
assigned more than its own data to process, then it shouldn't use a
deferrable timer. CPUs which only take care of their own data can use
a deferrable timer.

This works out of the box for stuff like vmstat, where the callback is
already done in a workqueue and we can register them with always =
true.

The scheduler stats are a slightly different beast, but it's not
rocket science to handle that.

We register the callback with always = false. So for a bog standard
system nothing happens, except the registering. Once the full NOHZ
mode is enabled on a cpu we schedule the work with a reasonable slow
period (e.g. 1 sec) on a non isolated cpu. That's where stuff gets
interesting.

On the isolated cpu we might still execute the scheduler tick because
we did not yet reach a condition to disable it. So we need to protect
the on-cpu accounting against the scheduled one on the remote
cpu. Unfortunately that requires locking. The only reasonable lock
here is the runqueue lock of the isolated cpu. Though this sounds
worse than it is. We take the cpu-local rq lock from the tick anyway
in scheduler_tick(). So we can move the account_process_tick() call to
this code. Zero impact for the non-isolated case.
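
In code, the move would look roughly like this (a sketch against the
3.x scheduler, not an actual patch):

    void scheduler_tick(void)
    {
            int cpu = smp_processor_id();
            struct rq *rq = cpu_rq(cpu);
            struct task_struct *curr = rq->curr;

            raw_spin_lock(&rq->lock);
            update_rq_clock(rq);
            /* moved here from update_process_times(): a remote stats
             * update takes the same rq->lock, so both sides serialize */
            account_process_tick(curr, user_mode(get_irq_regs()));
            curr->sched_class->task_tick(rq, curr, 0);
            raw_spin_unlock(&rq->lock);
    }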

In the isolated case we might only get contention when the isolated
cpu was not yet able to disable the tick, but the remote update is
going to be slow anyway and that update can exit early when it notices
that the last on-cpu update was less than a tick away.

Now if we run the remote update with a slow period (1 sec) there might
be some delay in the stats, but once the cpu has vanished into its
userspace while(1) mode we can really live with the slightly inaccurate
accumulation.

The only other issue might be posix cpu timers. For the start I really
would just ignore them. There are other means to watchdog a task
runtime, but we can extend the remote slow update scheme to posix cpu
timers as well if the need arises.

Thanks,

tglx

Subject: Re: RFC vmstat: On demand vmstat threads

On Fri, 20 Sep 2013, Thomas Gleixner wrote:

> The whole delegation stuff is necessary not just for vmstat. We have
> the same issue for scheduler stats and other parts of the kernel, so
> we are better off in having a core facility to schedule such functions
> in consistency with the current full NOHZ state.

Ok how do I make use of such a facility? What is the status of work on
such a thing?

Subject: Re: RFC vmstat: On demand vmstat threads

On Fri, 20 Sep 2013, Thomas Gleixner wrote:

> Now when a cpu becomes isolated we stop the callback scheduling on
> that cpu and assign it to the cpu with the smallest NUMA
> distance. So that cpu will process the data for itself and for the
> newly isolated cpu.

That is not possible for many percpu threads since they rely on running on
a specific cpu for optimization purposes. Running on a different processor
makes these threads racy.

What is needed is to be able to switch these things off and on.
Something on a different cpu may monitor whether processing on that
specific cpu is needed or not, but it cannot perform the vmstat updates.