2022-12-06 18:30:13

by Marcelo Tosatti

[permalink] [raw]
Subject: [PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.

When returning to userspace, fold the stats and cancel the delayed work.

When entering idle, only fold the stats.

Signed-off-by: Marcelo Tosatti <[email protected]>
---
include/linux/vmstat.h | 4 ++--
kernel/time/tick-sched.c | 2 +-
mm/vmstat.c | 41 ++++++++++++++++++++++++++++++++---------
3 files changed, 35 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/tick.h>

#include "internal.h"

@@ -195,9 +196,24 @@ void fold_vm_numa_events(void)

#ifdef CONFIG_SMP
static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;

static inline void vmstat_mark_dirty(void)
{
+ int cpu = smp_processor_id();
+
+ if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+ struct delayed_work *dw;
+
+ dw = &per_cpu(vmstat_work, cpu);
+ if (!delayed_work_pending(dw)) {
+ unsigned long delay;
+
+ delay = round_jiffies_relative(sysctl_stat_interval);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+ }
+ }
this_cpu_write(vmstat_dirty, true);
}

@@ -1886,9 +1902,6 @@ static const struct seq_operations vmsta
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
@@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
{
+ struct delayed_work *dw;
+
if (system_state != SYSTEM_RUNNING)
return;

if (!is_vmstat_dirty())
return;

+ refresh_cpu_vm_stats(false);
+
+ if (!user)
+ return;
/*
- * Just refresh counters and do not care about the pending delayed
- * vmstat_update. It doesn't fire that often to matter and canceling
- * it would be too expensive from this path.
- * vmstat_shepherd will take care about that for us.
+ * If the tick is stopped, cancel any delayed work to avoid
+ * interruptions to this CPU in the future.
*/
- refresh_cpu_vm_stats(false);
+ dw = &per_cpu(vmstat_work, smp_processor_id());
+ if (delayed_work_pending(dw))
+ cancel_delayed_work(dw);
}

/*
@@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

+ /* NOHZ full CPUs manage their own vmstat flushing */
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ continue;
+
if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);

Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

@@ -403,7 +403,7 @@ static inline void __dec_node_page_state

static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }

static inline void drain_zonestat(struct zone *zone,
struct per_cpu_zonestat *pzstats) { }
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- quiet_vmstat();
+ quiet_vmstat(false);

ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;



2022-12-14 13:26:41

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> For nohz full CPUs, manage per-CPU stat syncing from CPU context:
> start delayed work when marking per-CPU vmstat dirty.
>
> When returning to userspace, fold the stats and cancel the delayed work.
>
> When entering idle, only fold the stats.

The changelog still misses the reason behind the changes.

> @@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
>
> #ifdef CONFIG_SMP
> static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> +int sysctl_stat_interval __read_mostly = HZ;
>
> static inline void vmstat_mark_dirty(void)
> {
> + int cpu = smp_processor_id();
> +
> + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> + struct delayed_work *dw;
> +
> + dw = &per_cpu(vmstat_work, cpu);

this_cpu_ptr()

> + if (!delayed_work_pending(dw)) {
> + unsigned long delay;
> +
> + delay = round_jiffies_relative(sysctl_stat_interval);
> + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> + }
> + }
> this_cpu_write(vmstat_dirty, true);
> }
>
> @@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
> * until the diffs stay at zero. The function is used by NOHZ and can only be
> * invoked when tick processing is not active.
> */
> -void quiet_vmstat(void)
> +void quiet_vmstat(bool user)
> {
> + struct delayed_work *dw;
> +
> if (system_state != SYSTEM_RUNNING)
> return;
>
> if (!is_vmstat_dirty())
> return;
>
> + refresh_cpu_vm_stats(false);
> +
> + if (!user)
> + return;
> /*
> - * Just refresh counters and do not care about the pending delayed
> - * vmstat_update. It doesn't fire that often to matter and canceling
> - * it would be too expensive from this path.
> - * vmstat_shepherd will take care about that for us.
> + * If the tick is stopped, cancel any delayed work to avoid
> + * interruptions to this CPU in the future.
> */
> - refresh_cpu_vm_stats(false);
> + dw = &per_cpu(vmstat_work, smp_processor_id());

this_cpu_ptr()

Thanks.

> + if (delayed_work_pending(dw))
> + cancel_delayed_work(dw);
> }
>
> /*

2022-12-14 14:20:35

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> static inline void vmstat_mark_dirty(void)
> {
> + int cpu = smp_processor_id();
> +
> + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> + struct delayed_work *dw;
> +
> + dw = &per_cpu(vmstat_work, cpu);
> + if (!delayed_work_pending(dw)) {
> + unsigned long delay;
> +
> + delay = round_jiffies_relative(sysctl_stat_interval);
> + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);

Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
vmstat_shepherd makes sure to not rearm it afterward. But now it looks
possible for the above to do that mistake?

> + }
> + }
> this_cpu_write(vmstat_dirty, true);
> }
> @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> for_each_online_cpu(cpu) {
> struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
>
> + /* NOHZ full CPUs manage their own vmstat flushing */
> + if (tick_nohz_full_cpu(smp_processor_id()))

It should be the remote CPU instead of the current one.

Thanks.

> + continue;
> +
> if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
> queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
>

2022-12-16 16:46:30

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> > static inline void vmstat_mark_dirty(void)
> > {
> > + int cpu = smp_processor_id();
> > +
> > + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > + struct delayed_work *dw;
> > +
> > + dw = &per_cpu(vmstat_work, cpu);
> > + if (!delayed_work_pending(dw)) {
> > + unsigned long delay;
> > +
> > + delay = round_jiffies_relative(sysctl_stat_interval);
> > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
>
> Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> possible for the above to do that mistake?

Don't think the mistake is an issue. In case of a
queue_delayed_work_on being called after cancel_delayed_work_sync,
either vmstat_update executes on the local CPU, or on a
different CPU (after the bound kworkers have been moved).

Each case is fine (see vmstat_update).

> > + }
> > + }
> > this_cpu_write(vmstat_dirty, true);
> > }
> > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> > for_each_online_cpu(cpu) {
> > struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> >
> > + /* NOHZ full CPUs manage their own vmstat flushing */
> > + if (tick_nohz_full_cpu(smp_processor_id()))
>
> It should be the remote CPU instead of the current one.

Fixed.

2022-12-16 23:20:04

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH v9 3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

On Fri, Dec 16, 2022 at 01:16:09PM -0300, Marcelo Tosatti wrote:
> On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> > On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> > > static inline void vmstat_mark_dirty(void)
> > > {
> > > + int cpu = smp_processor_id();
> > > +
> > > + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > > + struct delayed_work *dw;
> > > +
> > > + dw = &per_cpu(vmstat_work, cpu);
> > > + if (!delayed_work_pending(dw)) {
> > > + unsigned long delay;
> > > +
> > > + delay = round_jiffies_relative(sysctl_stat_interval);
> > > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> >
> > Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> > vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> > possible for the above to do that mistake?
>
> Don't think the mistake is an issue. In case of a
> queue_delayed_work_on being called after cancel_delayed_work_sync,
> either vmstat_update executes on the local CPU, or on a
> different CPU (after the bound kworkers have been moved).

But after the CPU goes offline, its workqueue pool becomes UNBOUND. Which means
that the vmstat_update() from the offline CPU can then execute partly on CPU 0, then
gets preempted and executes halfway on CPU 1, then gets preempted and...

Having a quick look at refresh_cpu_vm_stats(), I doesn't look ready for that...

Thanks.

>
> Each case is fine (see vmstat_update).
>
> > > + }
> > > + }
> > > this_cpu_write(vmstat_dirty, true);
> > > }
> > > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> > > for_each_online_cpu(cpu) {
> > > struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> > >
> > > + /* NOHZ full CPUs manage their own vmstat flushing */
> > > + if (tick_nohz_full_cpu(smp_processor_id()))
> >
> > It should be the remote CPU instead of the current one.
>
> Fixed.
>