For nohz full CPUs, we'd like the per-CPU vm statistics to be
synchronized when userspace is executing. Otherwise,
the vmstat_shepherd might queue a work item to synchronize them,
which is undesired interference for isolated CPUs.
This means that it's necessary to check for, and possibly sync,
the statistics when returning to userspace. Consequently,
there are now two execution contexts, on different CPUs,
which require awareness about each other: context switch
and vmstat shepherd kernel thread.
To avoid shared variables between these two contexts (which
would require atomic accesses), delegate the responsibility
of statistics synchronization from vmstat_shepherd to local CPU
context, for nohz_full CPUs.
Do that by queueing a delayed work when marking per-CPU vmstat dirty.
When returning to userspace, fold the stats and cancel the delayed work.
When entering idle, only fold the stats.
Signed-off-by: Marcelo Tosatti <[email protected]>
---
include/linux/vmstat.h | 4 ++--
kernel/time/tick-sched.c | 2 +-
mm/vmstat.c | 41 ++++++++++++++++++++++++++++++++---------
3 files changed, 35 insertions(+), 12 deletions(-)
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/tick.h>
#include "internal.h"
@@ -194,21 +195,57 @@ void fold_vm_numa_events(void)
#endif
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+
+struct vmstat_dirty {
+ bool dirty;
+#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
+ bool cpu_offline;
+#endif
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct vmstat_dirty, vmstat_dirty_pcpu);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
+
+#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
+static inline void vmstat_queue_local_work(void)
+{
+ bool vmstat_dirty = this_cpu_read(vmstat_dirty_pcpu.dirty);
+ bool cpu_offline = this_cpu_read(vmstat_dirty_pcpu.cpu_offline);
+ int cpu = smp_processor_id();
+
+ if (tick_nohz_full_cpu(cpu) && !vmstat_dirty) {
+ struct delayed_work *dw;
+
+ dw = this_cpu_ptr(&vmstat_work);
+ if (!delayed_work_pending(dw) && !cpu_offline) {
+ unsigned long delay;
+
+ delay = round_jiffies_relative(sysctl_stat_interval);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+ }
+ }
+}
+#else
+static inline void vmstat_queue_local_work(void)
+{
+}
+#endif
static inline void vmstat_mark_dirty(void)
{
- this_cpu_write(vmstat_dirty, true);
+ vmstat_queue_local_work();
+ this_cpu_write(vmstat_dirty_pcpu.dirty, true);
}
static inline void vmstat_clear_dirty(void)
{
- this_cpu_write(vmstat_dirty, false);
+ this_cpu_write(vmstat_dirty_pcpu.dirty, false);
}
static inline bool is_vmstat_dirty(void)
{
- return this_cpu_read(vmstat_dirty);
+ return this_cpu_read(vmstat_dirty_pcpu.dirty);
}
int calculate_pressure_threshold(struct zone *zone)
@@ -1893,9 +1930,6 @@ static const struct seq_operations vmsta
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
@@ -1980,7 +2014,7 @@ static void vmstat_update(struct work_st
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
{
if (system_state != SYSTEM_RUNNING)
return;
@@ -1988,13 +2022,19 @@ void quiet_vmstat(void)
if (!is_vmstat_dirty())
return;
+ refresh_cpu_vm_stats(false);
+
+ if (!IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER))
+ return;
+
+ if (!user)
+ return;
/*
- * Just refresh counters and do not care about the pending delayed
- * vmstat_update. It doesn't fire that often to matter and canceling
- * it would be too expensive from this path.
- * vmstat_shepherd will take care about that for us.
+ * If the tick is stopped, cancel any delayed work to avoid
+ * interruptions to this CPU in the future.
*/
- refresh_cpu_vm_stats(false);
+ if (delayed_work_pending(this_cpu_ptr(&vmstat_work)))
+ cancel_delayed_work(this_cpu_ptr(&vmstat_work));
}
/*
@@ -2015,8 +2055,14 @@ static void vmstat_shepherd(struct work_
/* Check processors whose vmstat worker threads have been disabled */
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
+
+ if (IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER))
+ /* NOHZ full CPUs manage their own vmstat flushing */
+ if (tick_nohz_full_cpu(cpu))
+ continue;
- if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
+ if (!delayed_work_pending(dw) && vms->dirty)
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
cond_resched();
@@ -2049,8 +2095,36 @@ static void __init init_cpu_node_state(v
}
}
+#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
+static void vmstat_cpu_online_rearm(unsigned int cpu)
+{
+ struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
+
+ if (tick_nohz_full_cpu(cpu)) {
+ struct delayed_work *dw;
+
+ vms->cpu_offline = false;
+ vms->dirty = true;
+
+ dw = this_cpu_ptr(&vmstat_work);
+ if (!delayed_work_pending(dw)) {
+ unsigned long delay;
+
+ delay = round_jiffies_relative(sysctl_stat_interval);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+ }
+ }
+}
+#else
+static void vmstat_cpu_online_rearm(unsigned int cpu)
+{
+}
+#endif
+
static int vmstat_cpu_online(unsigned int cpu)
{
+ vmstat_cpu_online_rearm(cpu);
+
refresh_zone_stat_thresholds();
if (!node_state(cpu_to_node(cpu), N_CPU)) {
@@ -2060,8 +2134,28 @@ static int vmstat_cpu_online(unsigned in
return 0;
}
+
+#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
+static void vmstat_mark_cpu_offline(unsigned int cpu)
+{
+ struct vmstat_dirty *vms = per_cpu_ptr(&vmstat_dirty_pcpu, cpu);
+
+ vms->cpu_offline = true;
+}
+#else
+static void vmstat_mark_cpu_offline(unsigned int cpu)
+{
+}
+#endif
+
+/*
+ * Callbacks in the ONLINE section (CPUHP_AP_ONLINE_DYN is in this section),
+ * are invoked on the hotplugged CPU from the per CPU
+ * hotplug thread with interrupts and preemption enabled.
+ */
static int vmstat_cpu_down_prep(unsigned int cpu)
{
+ vmstat_mark_cpu_offline(cpu);
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
return 0;
}
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
@@ -403,7 +403,7 @@ static inline void __dec_node_page_state
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }
static inline void drain_zonestat(struct zone *zone,
struct per_cpu_zonestat *pzstats) { }
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- quiet_vmstat();
+ quiet_vmstat(false);
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -678,6 +678,19 @@ config CPU_ISOLATION
Say Y if unsure.
+config FLUSH_WORK_ON_RESUME_USER
+ bool "Flush per-CPU vmstats on user return (for nohz full CPUs)"
+ depends on NO_HZ_FULL
+ default y
+
+ help
+ By default, nohz full CPUs flush per-CPU vm statistics on return
+ to userspace (to avoid additional interferences when executing
+ userspace code). This has a small but measurable impact on
+ system call performance. You can disable this to improve system call
+ performance, at the expense of potential interferences to userspace
+ execution.
+
source "kernel/rcu/Kconfig"
config BUILD_BIN2C
On Fri, Jan 06, 2023 at 08:12:44AM +0800, Hillf Danton wrote:
> On 05 Jan 2023 09:52:21 -0300 Marcelo Tosatti <[email protected]>
> > For nohz full CPUs, we'd like the per-CPU vm statistics to be
> > synchronized when userspace is executing. Otherwise,
> > the vmstat_shepherd might queue a work item to synchronize them,
> > which is undesired intereference for isolated CPUs.
> >
> > This means that its necessary to check for, and possibly sync,
> > the statistics when returning to userspace. This means that
> > there are now two execution contexes, on different CPUs,
> > which require awareness about each other: context switch
> > and vmstat shepherd kernel threadr.
> >
> > To avoid the shared variables between these two contexes (which
> > would require atomic accesses), delegate the responsability
> > of statistics synchronization from vmstat_shepherd to local CPU
> > context, for nohz_full CPUs.
> >
> > Do that by queueing a delayed work when marking per-CPU vmstat dirty.
> >
> > When returning to userspace, fold the stats and cancel the delayed work.
> >
> > When entering idle, only fold the stats.
> >
> > Signed-off-by: Marcelo Tosatti <[email protected]>
> > ---
> > include/linux/vmstat.h | 4 ++--
> > kernel/time/tick-sched.c | 2 +-
> > mm/vmstat.c | 41 ++++++++++++++++++++++++++++++++---------
> > 3 files changed, 35 insertions(+), 12 deletions(-)
> >
> > Index: linux-2.6/mm/vmstat.c
> > ===================================================================
> > --- linux-2.6.orig/mm/vmstat.c
> > +++ linux-2.6/mm/vmstat.c
> > @@ -28,6 +28,7 @@
> > #include <linux/mm_inline.h>
> > #include <linux/page_ext.h>
> > #include <linux/page_owner.h>
> > +#include <linux/tick.h>
> >
> > #include "internal.h"
> >
> > @@ -194,21 +195,57 @@ void fold_vm_numa_events(void)
> > #endif
> >
> > #ifdef CONFIG_SMP
> > -static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> > +
> > +struct vmstat_dirty {
> > + bool dirty;
> > +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
> > + bool cpu_offline;
> > +#endif
> > +};
> > +
> > +static DEFINE_PER_CPU_ALIGNED(struct vmstat_dirty, vmstat_dirty_pcpu);
> > +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> > +int sysctl_stat_interval __read_mostly = HZ;
> > +
> > +#ifdef CONFIG_FLUSH_WORK_ON_RESUME_USER
> > +static inline void vmstat_queue_local_work(void)
> > +{
> > + bool vmstat_dirty = this_cpu_read(vmstat_dirty_pcpu.dirty);
> > + bool cpu_offline = this_cpu_read(vmstat_dirty_pcpu.cpu_offline);
> > + int cpu = smp_processor_id();
> > +
> > + if (tick_nohz_full_cpu(cpu) && !vmstat_dirty) {
> > + struct delayed_work *dw;
> > +
> > + dw = this_cpu_ptr(&vmstat_work);
> > + if (!delayed_work_pending(dw) && !cpu_offline) {
> > + unsigned long delay;
> > +
> > + delay = round_jiffies_relative(sysctl_stat_interval);
> > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
>
> Regression wrt V12 if timer is added on the CPU that is not doing HK_TYPE_TIMER?
Before this change, the timer was managed (and queued on an isolated
CPU) by vmstat_shepherd. Now it is managed (and queued) by the local
CPU, so there is no regression.
Thanks.
On Fri, Jan 06, 2023 at 11:01:54PM +0800, Hillf Danton wrote:
> On 6 Jan 2023 09:51:00 -0300 Marcelo Tosatti <[email protected]>
> > On Fri, Jan 06, 2023 at 08:12:44AM +0800, Hillf Danton wrote:
> > >
> > > Regression wrt V12 if timer is added on the CPU that is not doing HK_TYPE_TIMER?
> >
> > Before this change, the timer was managed (and queued on an isolated
> > CPU) by vmstat_shepherd. Now it is managed (and queued) by the local
> > CPU, so there is no regression.
>
> Given vm stats folded when returning to userspace, queuing the delayed work
> barely makes sense in the first place. If it can be canceled, queuing it burns
> cycles with nothing earned. Otherwise vm stats got folded already.
Agree, but you can't know whether return to userspace will occur
before the timer is fired.
So queueing the timer is to _ensure_ that eventually vmstats will be
synced (which maintains the current timing behaviour wrt vmstat syncs).
Also, I don't think the queueing cost is significant: it only happens
for the first vmstat dirty item.
> Nor does shepherd even without delay. And the right thing is only make shepherd
> leave isolated CPUs intact.
>
>
Hi Hillf,
On Sat, Jan 07, 2023 at 08:15:29AM +0800, Hillf Danton wrote:
> On 6 Jan 2023 15:16:23 -0300 Marcelo Tosatti <[email protected]>
> > On Fri, Jan 06, 2023 at 11:01:54PM +0800, Hillf Danton wrote:
> > > On 6 Jan 2023 09:51:00 -0300 Marcelo Tosatti <[email protected]>
> > > > On Fri, Jan 06, 2023 at 08:12:44AM +0800, Hillf Danton wrote:
> > > > >
> > > > > Regression wrt V12 if timer is added on the CPU that is not doing HK_TYPE_TIMER?
> > > >
> > > > Before this change, the timer was managed (and queued on an isolated
> > > > CPU) by vmstat_shepherd. Now it is managed (and queued) by the local
> > > > CPU, so there is no regression.
> > >
> > > Given vm stats folded when returning to userspace, queuing the delayed work
> > > barely makes sense in the first place. If it can be canceled, queuing it burns
> > > cycles with nothing earned. Otherwise vm stats got folded already.
> >
> > Agree, but you can't know whether return to userspace will occur
> > before the timer is fired.
>
> No way to predict a random timer expiration, no?
Right.
> >
> > So queueing the timer is to _ensure_ that eventually vmstats will be
> > synced (which maintains the current timing behaviour wrt vmstat syncs).
>
> After this change,
>
> > > > > > @@ -1988,13 +2022,19 @@ void quiet_vmstat(void)
> > > > > > if (!is_vmstat_dirty())
> > > > > > return;
> > > > > >
>
> it is only ensured eventually by this check instead.
Yes, but if you do not return to userspace, then the per-CPU vm
statistics can be dirty indefinitely.
> > > > > > + refresh_cpu_vm_stats(false);
> > > > > > +
> > > > > > + if (!IS_ENABLED(CONFIG_FLUSH_WORK_ON_RESUME_USER))
> > > > > > + return;
> > > > > > +
> > > > > > + if (!user)
> > > > > > + return;
>
>
> > Also don't think the queueing cost is significant: it only happens
> > for the first vmstat dirty item.
>
> Cost is considered only if it is needed.
Not sure I understand what you mean (or whether there is any alternative
to the timer).
On Tue, Jan 10, 2023 at 10:43:56AM +0800, Hillf Danton wrote:
> On 9 Jan 2023 11:12:49 -0300 Marcelo Tosatti <[email protected]>
> >
> > Yes, but if you do not return to userspace, then the per-CPU vm
> > statistics can be dirty indefinitely.
>
> Could you specify the reasons for failing to return to userspace,
> given it is undesired intereference for the shepherd to queue work
> on the isolated CPUs.
Any system call that takes longer than the threshold to sync vmstats.
Or a long running kernel thread, for example:
https://stackoverflow.com/questions/65111483/long-running-kthread-and-synchronize-net
On Tue, Jan 10, 2023 at 11:19:01PM +0800, Hillf Danton wrote:
> On Tue, 10 Jan 2023 08:50:28 -0300 Marcelo Tosatti <[email protected]>
> > On Tue, Jan 10, 2023 at 10:43:56AM +0800, Hillf Danton wrote:
> > > On 9 Jan 2023 11:12:49 -0300 Marcelo Tosatti <[email protected]>
> > > >
> > > > Yes, but if you do not return to userspace, then the per-CPU vm
> > > > statistics can be dirty indefinitely.
> > >
> > > Could you specify the reasons for failing to return to userspace,
> > > given it is undesired intereference for the shepherd to queue work
> > > on the isolated CPUs.
> >
> > Any system call that takes longer than the threshold to sync vmstats.
>
> Which ones?
>
> If schedule() occurs during syscall because of acquiring mutex for instance
> then anything on the isolated runqueue, including workqueue worker shepherd
> wakes up, can burn CPU cycles without undesired intereference produced.
The above confuses me. How would other tasks help with syscalls that take too long to
service?
> >
> > Or a long running kernel thread, for example:
>
> It is a buggyyyy example.
> >
> > https://stackoverflow.com/questions/65111483/long-running-kthread-and-synchronize-net
I can imagine a CPU spending most of its time processing networking packets
through interrupts/softirq within ksoftirqd/NAPI while another CPU processes
these packets in userspace.
In this case the CPU handling the kernel part can theoretically never go to
idle/user. nohz_full isn't optimized toward such job but there is nothing
to prevent it from doing such job.
Thanks.
On Wed, Jan 11, 2023 at 07:58:22AM +0800, Hillf Danton wrote:
> On 10 Jan 2023 17:12:22 +0100 Frederic Weisbecker <[email protected]>
> > On Tue, Jan 10, 2023 at 11:19:01PM +0800, Hillf Danton wrote:
> > > On Tue, 10 Jan 2023 08:50:28 -0300 Marcelo Tosatti <[email protected]>
> > > > On Tue, Jan 10, 2023 at 10:43:56AM +0800, Hillf Danton wrote:
> > > > > On 9 Jan 2023 11:12:49 -0300 Marcelo Tosatti <[email protected]>
> > > > > >
> > > > > > Yes, but if you do not return to userspace, then the per-CPU vm
> > > > > > statistics can be dirty indefinitely.
> > > > >
> > > > > Could you specify the reasons for failing to return to userspace,
> > > > > given it is undesired intereference for the shepherd to queue work
> > > > > on the isolated CPUs.
> > > >
> > > > Any system call that takes longer than the threshold to sync vmstats.
> > >
> > > Which ones?
> > >
> > > If schedule() occurs during syscall because of acquiring mutex for instance
> > > then anything on the isolated runqueue, including workqueue worker shepherd
> > > wakes up, can burn CPU cycles without undesired interference produced.
> >
> > The above confuses me. How others tasks would help with syscalls that take too long too
> > service?
>
> Given no scheduling in userspace, no chance for other tasks to interfere
> after returning to userspace, on one hand.
>
> Upon scheduling during syscall on the other hand, it is the right time
> to sync vmstats for example. But no vmstats can be updated without works
> queued by shepherd.
>
> In a nutshell, no interference could happen without scheduling, and how
> work is queued does not matter. So the current shepherd behavior is prefered.
I'm still confused... We want to avoid the shepherd because it may queue the
vmstat work while the task wants to run noise-free in userspace.
> >
> > > >
> > > > Or a long running kernel thread, for example:
> > >
> > > It is a buggyyyy example.
> > > >
> > > > https://stackoverflow.com/questions/65111483/long-running-kthread-and-synchronize-net
> >
> > I can imagine a CPU spending most of its time processing networking packets
> > through interrupts/softirq within ksoftirqd/NAPI while another CPU process
> > these packets in userspace.
> >
> > In this case the CPU handling the kernel part can theoretically never go to
> > idle/user. nohz_full isn't optimized toward such job but there is nothing
> > to prevent it from doing such job.
>
> A simple FIFO task launched by an administrator can get a CPU out of scheduler's
> control for a week, regardless of isolation.
Sure. But, what do you mean by that exactly?
Thanks.