2021-04-28 07:01:46

by kernel test robot

[permalink] [raw]
Subject: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression



Greeting,

FYI, we noticed a -3.0% regression of will-it-scale.per_process_ops due to commit:


commit: 47b8ff194c1fd73d58dc339b597d466fe48c8958 ("entry: Explicitly flush pending rcuog wakeup before last rescheduling point")
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master


in testcase: will-it-scale
on test machine: 192 threads Intel(R) Xeon(R) Platinum 9242 CPU @ 2.30GHz with 192G memory
with following parameters:

nr_task: 100%
mode: process
test: futex3
cpufreq_governor: performance
ucode: 0x5003006

test-description: Will It Scale takes a testcase and runs it from 1 through to n parallel copies to see if the testcase will scale. It builds both a process and threads based test in order to see any differences between the two.
test-url: https://github.com/antonblanchard/will-it-scale



If you fix the issue, kindly add the following tag
Reported-by: kernel test robot <[email protected]>


Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml
bin/lkp run compatible-job.yaml

=========================================================================================
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase/ucode:
gcc-9/performance/x86_64-rhel-8.3/process/100%/debian-10.4-x86_64-20200603.cgz/lkp-csl-2ap2/futex3/will-it-scale/0x5003006

commit:
f8bb5cae96 ("rcu/nocb: Trigger self-IPI on late deferred wake up before user resume")
47b8ff194c ("entry: Explicitly flush pending rcuog wakeup before last rescheduling point")

f8bb5cae9616224a 47b8ff194c1fd73d58dc339b597
---------------- ---------------------------
%stddev %change %stddev
\ | \
1.25e+09 -3.0% 1.212e+09 will-it-scale.192.processes
6508984 -3.0% 6314032 will-it-scale.per_process_ops
1.25e+09 -3.0% 1.212e+09 will-it-scale.workload
68.00 -1.5% 67.00 vmstat.cpu.sy
30.00 +3.3% 31.00 vmstat.cpu.us
8.622e+10 +1.2% 8.728e+10 perf-stat.i.branch-instructions
0.38 -0.0 0.36 perf-stat.i.branch-miss-rate%
3.206e+08 -3.7% 3.088e+08 perf-stat.i.branch-misses
0.98 +1.1% 0.99 perf-stat.i.cpi
263518 +2.3% 269550 perf-stat.i.dTLB-store-misses
1.135e+11 -1.9% 1.113e+11 perf-stat.i.dTLB-stores
3.257e+08 -4.8% 3.1e+08 perf-stat.i.iTLB-load-misses
5.718e+11 -1.1% 5.656e+11 perf-stat.i.instructions
1758 +3.9% 1827 perf-stat.i.instructions-per-iTLB-miss
1.03 -1.1% 1.01 perf-stat.i.ipc
0.37 -0.0 0.35 perf-stat.overall.branch-miss-rate%
0.97 +1.1% 0.99 perf-stat.overall.cpi
0.00 +0.0 0.00 perf-stat.overall.dTLB-store-miss-rate%
1755 +3.9% 1824 perf-stat.overall.instructions-per-iTLB-miss
1.03 -1.1% 1.02 perf-stat.overall.ipc
138016 +2.0% 140712 perf-stat.overall.path-length
8.592e+10 +1.2% 8.698e+10 perf-stat.ps.branch-instructions
3.195e+08 -3.7% 3.078e+08 perf-stat.ps.branch-misses
262973 +2.3% 269022 perf-stat.ps.dTLB-store-misses
1.131e+11 -1.9% 1.109e+11 perf-stat.ps.dTLB-stores
3.246e+08 -4.8% 3.09e+08 perf-stat.ps.iTLB-load-misses
5.698e+11 -1.1% 5.637e+11 perf-stat.ps.instructions
1.725e+14 -1.1% 1.706e+14 perf-stat.total.instructions
32.11 -1.0 31.08 perf-profile.calltrace.cycles-pp.__entry_text_start.syscall
36.13 -0.3 35.81 perf-profile.calltrace.cycles-pp.__x64_sys_futex.do_syscall_64.entry_SYSCALL_64_after_hwframe.syscall
39.88 -0.3 39.58 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.syscall
30.75 -0.2 30.57 perf-profile.calltrace.cycles-pp.do_futex.__x64_sys_futex.do_syscall_64.entry_SYSCALL_64_after_hwframe.syscall
2.22 -0.1 2.16 perf-profile.calltrace.cycles-pp.testcase
2.15 -0.1 2.09 perf-profile.calltrace.cycles-pp.syscall_enter_from_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.syscall
2.21 -0.0 2.17 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_safe_stack.syscall
6.22 +0.1 6.32 perf-profile.calltrace.cycles-pp.get_futex_key.futex_wake.do_futex.__x64_sys_futex.do_syscall_64
1.17 +0.1 1.31 perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode_prepare.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe.syscall
52.27 +1.4 53.62 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.syscall
3.53 +1.5 5.00 perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe.syscall
0.00 +1.5 1.55 perf-profile.calltrace.cycles-pp.rcu_nocb_flush_deferred_wakeup.exit_to_user_mode_prepare.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe.syscall
5.58 +1.9 7.47 perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe.syscall
20.72 -0.6 20.09 perf-profile.children.cycles-pp.__entry_text_start
17.34 -0.5 16.87 perf-profile.children.cycles-pp.syscall_return_via_sysret
40.05 -0.3 39.75 perf-profile.children.cycles-pp.do_syscall_64
36.43 -0.2 36.20 perf-profile.children.cycles-pp.__x64_sys_futex
31.18 -0.2 30.98 perf-profile.children.cycles-pp.do_futex
2.36 -0.1 2.30 perf-profile.children.cycles-pp.entry_SYSCALL_64_safe_stack
2.41 -0.1 2.34 perf-profile.children.cycles-pp.testcase
2.19 -0.1 2.12 perf-profile.children.cycles-pp.syscall_enter_from_user_mode
97.88 +0.1 97.94 perf-profile.children.cycles-pp.syscall
6.46 +0.1 6.58 perf-profile.children.cycles-pp.get_futex_key
1.19 +0.1 1.33 perf-profile.children.cycles-pp.syscall_exit_to_user_mode_prepare
52.69 +1.4 54.05 perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
0.00 +1.6 1.60 perf-profile.children.cycles-pp.rcu_nocb_flush_deferred_wakeup
3.58 +1.7 5.33 perf-profile.children.cycles-pp.exit_to_user_mode_prepare
6.16 +1.9 8.07 perf-profile.children.cycles-pp.syscall_exit_to_user_mode
15.88 -0.5 15.33 perf-profile.self.cycles-pp.syscall
17.22 -0.5 16.75 perf-profile.self.cycles-pp.syscall_return_via_sysret
6.00 -0.3 5.70 perf-profile.self.cycles-pp.do_futex
9.33 -0.2 9.09 perf-profile.self.cycles-pp.__entry_text_start
6.58 -0.2 6.35 perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe
8.29 -0.1 8.22 perf-profile.self.cycles-pp.hash_futex
2.36 -0.1 2.29 perf-profile.self.cycles-pp.entry_SYSCALL_64_safe_stack
1.86 -0.1 1.80 perf-profile.self.cycles-pp.syscall_enter_from_user_mode
1.96 -0.0 1.91 perf-profile.self.cycles-pp.testcase
1.14 +0.1 1.27 perf-profile.self.cycles-pp.syscall_exit_to_user_mode_prepare
6.04 +0.2 6.19 perf-profile.self.cycles-pp.get_futex_key
3.29 +0.4 3.65 perf-profile.self.cycles-pp.exit_to_user_mode_prepare
0.00 +1.4 1.39 perf-profile.self.cycles-pp.rcu_nocb_flush_deferred_wakeup



will-it-scale.192.processes

1.4e+09 +-----------------------------------------------------------------+
| .+.+ ++.++.+.+ +.|
1.2e+09 |.++.++.+O +.++.++.++.+.++.++.++.+.++.++.++ : : +.++.+ |
| : : : : |
1e+09 |-+ : : : : |
| : : : : |
8e+08 |-+ : : : : |
| :: : : |
6e+08 |-+ :: :: |
| : :: |
4e+08 |-+ : :: |
| + : |
2e+08 |-+ : |
| : |
0 +-----------------------------------------------------------------+


will-it-scale.per_process_ops

7e+06 +-------------------------------------------------------------------+
|.++.++.+ O+.+.++.++.+.++.+.++.++.+.++.++.+.++.+ +.++.+.++.+.++.++.|
6e+06 |-OO OO : : : : |
| : : : : |
5e+06 |-+ : : : : |
| : : : : |
4e+06 |-+ : : : : |
| :: : : |
3e+06 |-+ :: :: |
| : :: |
2e+06 |-+ : :: |
| + : |
1e+06 |-+ : |
| : |
0 +-------------------------------------------------------------------+


will-it-scale.workload

1.4e+09 +-----------------------------------------------------------------+
| .+.+ ++.++.+.+ +.|
1.2e+09 |.++.++.+O +.++.++.++.+.++.++.++.+.++.++.++ : : +.++.+ |
| : : : : |
1e+09 |-+ : : : : |
| : : : : |
8e+08 |-+ : : : : |
| :: : : |
6e+08 |-+ :: :: |
| : :: |
4e+08 |-+ : :: |
| + : |
2e+08 |-+ : |
| : |
0 +-----------------------------------------------------------------+


[*] bisect-good sample
[O] bisect-bad sample



Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation

Thanks,
Oliver Sang


Attachments:
(No filename) (13.85 kB)
config-5.11.0-00050-g47b8ff194c1f (175.09 kB)
job-script (7.69 kB)
job.yaml (5.27 kB)
reproduce (349.00 B)
Download all attachments

2021-05-13 08:21:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

On Wed, Apr 28, 2021 at 03:16:53PM +0800, kernel test robot wrote:
>
>
> Greeting,
>
> FYI, we noticed a -3.0% regression of will-it-scale.per_process_ops due to commit:
>
>
> commit: 47b8ff194c1fd73d58dc339b597d466fe48c8958 ("entry: Explicitly flush pending rcuog wakeup before last rescheduling point")

So the RCU bits are in rcu_user_enter(), which is called from
__context_tracking_enter() aka user_enter(), which is under
context_tracking_enabled().

But the new code in entry is not; we now unconditionally call
rcu_nocb_flush_deferred_wakeup(). Did that want to be under
context_tracking_enabled() as well?

Frederic, Paul?

---

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 9455476c5ba2..f4df001410fc 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -114,7 +114,12 @@ static inline void rcu_user_exit(void) { }
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
-void rcu_nocb_flush_deferred_wakeup(void);
+void __rcu_nocb_flush_deferred_wakeup(void);
+static inline void rcu_nocb_flush_deferred_wakeup(void)
+{
+ if (context_tracking_enabled())
+ __rcu_nocb_flush_deferred_wakeup();
+}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ad0156b86937..3cdbbf7fba01 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2378,7 +2378,7 @@ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
return false;
}

-void rcu_nocb_flush_deferred_wakeup(void)
+void __rcu_nocb_flush_deferred_wakeup(void)
{
do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
}


2021-05-13 22:51:27

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

On Thu, May 13, 2021 at 10:19:21AM +0200, Peter Zijlstra wrote:
> On Wed, Apr 28, 2021 at 03:16:53PM +0800, kernel test robot wrote:
> >
> >
> > Greeting,
> >
> > FYI, we noticed a -3.0% regression of will-it-scale.per_process_ops due to commit:
> >
> >
> > commit: 47b8ff194c1fd73d58dc339b597d466fe48c8958 ("entry: Explicitly flush pending rcuog wakeup before last rescheduling point")
>
> So the RCU bits are in rcu_user_enter(), which is called from
> __context_tracking_enter() aka user_enter(), which is under
> context_tracking_enabled().
>
> But the new code in entry is not; we now unconditionally call
> rcu_nocb_flush_deferred_wakeup(). Did that want to be under
> context_tracking_enabled() as well?
>
> Frederic, Paul?

My argument in favor of the change below is that if there is no context
tracking, then scheduling-clock interrupts will happen on all non-idle
CPUs. The next scheduling-clock interrupt will check this deferred-wakeup
state, and if set, cause rcu_core() to be invoked (for example, within the
RCU_SOFTIRQ handler). And rcu_core() invokes do_nocb_deferred_wakeup(),
which takes care of this.

For idle CPUs, do_idle() invokes rcu_nocb_flush_deferred_wakeup().

Frederic, any cases that I am missing?

Thanx, Paul

> ---
>
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 9455476c5ba2..f4df001410fc 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -114,7 +114,12 @@ static inline void rcu_user_exit(void) { }
> void rcu_init_nohz(void);
> int rcu_nocb_cpu_offload(int cpu);
> int rcu_nocb_cpu_deoffload(int cpu);
> -void rcu_nocb_flush_deferred_wakeup(void);
> +void __rcu_nocb_flush_deferred_wakeup(void);
> +static inline void rcu_nocb_flush_deferred_wakeup(void)
> +{
> + if (context_tracking_enabled())
> + __rcu_nocb_flush_deferred_wakeup();
> +}
> #else /* #ifdef CONFIG_RCU_NOCB_CPU */
> static inline void rcu_init_nohz(void) { }
> static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index ad0156b86937..3cdbbf7fba01 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -2378,7 +2378,7 @@ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
> return false;
> }
>
> -void rcu_nocb_flush_deferred_wakeup(void)
> +void __rcu_nocb_flush_deferred_wakeup(void)
> {
> do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
> }
>

2021-05-14 02:10:17

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

On Thu, May 13, 2021 at 10:46:36AM -0700, Paul E. McKenney wrote:
> On Thu, May 13, 2021 at 10:19:21AM +0200, Peter Zijlstra wrote:
> > On Wed, Apr 28, 2021 at 03:16:53PM +0800, kernel test robot wrote:
> > >
> > >
> > > Greeting,
> > >
> > > FYI, we noticed a -3.0% regression of will-it-scale.per_process_ops due to commit:
> > >
> > >
> > > commit: 47b8ff194c1fd73d58dc339b597d466fe48c8958 ("entry: Explicitly flush pending rcuog wakeup before last rescheduling point")
> >
> > So the RCU bits are in rcu_user_enter(), which is called from
> > __context_tracking_enter() aka user_enter(), which is under
> > context_tracking_enabled().
> >
> > But the new code in entry is not; we now unconditionally call
> > rcu_nocb_flush_deferred_wakeup(). Did that want to be under
> > context_tracking_enabled() as well?
> >
> > Frederic, Paul?
>
> My argument in favor of the change below is that if there is no context
> tracking, then scheduling-clock interrupts will happen on all non-idle
> CPUs. The next scheduling-clock interrupt will check this deferred-wakeup
> state, and if set, cause rcu_core() to be invoked (for example, within the
> RCU_SOFTIRQ handler). And rcu_core() invokes do_nocb_deferred_wakeup(),
> which takes care of this.
>
> For idle CPUs, do_idle() invokes rcu_nocb_flush_deferred_wakeup().
>
> Frederic, any cases that I am missing?

That sounds good, but two things:

1) Even if context tracking is not running, we still need to handle
deferred wakeups on idle. But all user/guest/idle currently use the
same function.

2) Context tracking may be running even when full nohz is not. But here only
full nohz is concerned.

So the change should rather be as follows (completely untested!).
I rather put the static key check in tick.h in order not to involve
deep dependencies inside rcupdate.h (especially rcupdate.h -> tick.h -> sched.h)

diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 8b2b1d68b954..136b8d97d8c0 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -3,6 +3,7 @@
#define __LINUX_ENTRYKVM_H

#include <linux/entry-common.h>
+#include <linux/tick.h>

/* Transfer to guest mode work */
#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
@@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
static inline void xfer_to_guest_mode_prepare(void)
{
lockdep_assert_irqs_disabled();
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
}

/**
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0bb80a7f05b9..bfd571f18cfd 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -11,6 +11,7 @@
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
+#include <linux/rcupdate.h>

#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void __init tick_init(void);
@@ -304,4 +305,10 @@ static inline void tick_nohz_task_switch(void)
__tick_nohz_task_switch();
}

+static inline void tick_nohz_user_enter_prepare(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ rcu_nocb_flush_deferred_wakeup();
+}
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a0b3b04fb596..bf16395b9e13 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include <linux/tick.h>

#include "common.h"

@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_disable_exit_to_user();

/* Check if any of the above work has queued a deferred wakeup */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();

ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();

/* Flush pending rcuog wakeup before the last need_resched() check */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);

2021-05-14 10:49:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

On Fri, May 14, 2021 at 01:11:27AM +0200, Frederic Weisbecker wrote:

> That sounds good, but two things:
>
> 1) Even if context tracking is not running, we still need to handle
> deferred wakeups on idle. But all user/guest/idle currently use the
> same function.
>
> 2) Context tracking may be running even when full nohz is not. But here only
> full nohz is concerned.
>
> So the change should rather be as follows (completely untested!).
> I rather put the static key check in tick.h in order not to involve
> deep dependencies inside rcupdate.h (especially rcupdate.h -> tick.h -> sched.h)

Compiles and boots for me, 0day folks, could you please test this makes
the reported regression go away?

> diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
> index 8b2b1d68b954..136b8d97d8c0 100644
> --- a/include/linux/entry-kvm.h
> +++ b/include/linux/entry-kvm.h
> @@ -3,6 +3,7 @@
> #define __LINUX_ENTRYKVM_H
>
> #include <linux/entry-common.h>
> +#include <linux/tick.h>
>
> /* Transfer to guest mode work */
> #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
> @@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
> static inline void xfer_to_guest_mode_prepare(void)
> {
> lockdep_assert_irqs_disabled();
> - rcu_nocb_flush_deferred_wakeup();
> + tick_nohz_user_enter_prepare();
> }
>
> /**
> diff --git a/include/linux/tick.h b/include/linux/tick.h
> index 0bb80a7f05b9..bfd571f18cfd 100644
> --- a/include/linux/tick.h
> +++ b/include/linux/tick.h
> @@ -11,6 +11,7 @@
> #include <linux/context_tracking_state.h>
> #include <linux/cpumask.h>
> #include <linux/sched.h>
> +#include <linux/rcupdate.h>
>
> #ifdef CONFIG_GENERIC_CLOCKEVENTS
> extern void __init tick_init(void);
> @@ -304,4 +305,10 @@ static inline void tick_nohz_task_switch(void)
> __tick_nohz_task_switch();
> }
>
> +static inline void tick_nohz_user_enter_prepare(void)
> +{
> + if (tick_nohz_full_cpu(smp_processor_id()))
> + rcu_nocb_flush_deferred_wakeup();
> +}
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index a0b3b04fb596..bf16395b9e13 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -5,6 +5,7 @@
> #include <linux/highmem.h>
> #include <linux/livepatch.h>
> #include <linux/audit.h>
> +#include <linux/tick.h>
>
> #include "common.h"
>
> @@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> local_irq_disable_exit_to_user();
>
> /* Check if any of the above work has queued a deferred wakeup */
> - rcu_nocb_flush_deferred_wakeup();
> + tick_nohz_user_enter_prepare();
>
> ti_work = READ_ONCE(current_thread_info()->flags);
> }
> @@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
> lockdep_assert_irqs_disabled();
>
> /* Flush pending rcuog wakeup before the last need_resched() check */
> - rcu_nocb_flush_deferred_wakeup();
> + tick_nohz_user_enter_prepare();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);

2021-05-19 19:12:49

by kernel test robot

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

hi, Peter Zijlstra,

On Fri, May 14, 2021 at 12:13:22PM +0200, Peter Zijlstra wrote:
> On Fri, May 14, 2021 at 01:11:27AM +0200, Frederic Weisbecker wrote:
>
> > That sounds good, but two things:
> >
> > 1) Even if context tracking is not running, we still need to handle
> > deferred wakeups on idle. But all user/guest/idle currently use the
> > same function.
> >
> > 2) Context tracking may be running even when full nohz is not. But here only
> > full nohz is concerned.
> >
> > So the change should rather be as follows (completely untested!).
> > I rather put the static key check in tick.h in order not to involve
> > deep dependencies inside rcupdate.h (especially rcupdate.h -> tick.h -> sched.h)
>
> Compiles and boots for me, 0day folks, could you please test this makes
> the reported regression go away?

sorry for the late reply.

on which base should we apply below patch?
we tried to apply upon 4ae7dc97f726e ("entry/kvm: Explicitly flush pending rcuog
wakeup before last rescheduling point") and v5.12, but both build failed like below:

ERROR: modpost: "tick_nohz_full_mask" [arch/x86/kvm/kvm.ko] undefined!

or is there any particular kconfig that needs to be enabled?
(the kconfig which we used to build 47b8ff194c is in original report
which maybe you could help have a look what we may miss)

Thanks

>
> > diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
> > index 8b2b1d68b954..136b8d97d8c0 100644
> > --- a/include/linux/entry-kvm.h
> > +++ b/include/linux/entry-kvm.h
> > @@ -3,6 +3,7 @@
> > #define __LINUX_ENTRYKVM_H
> >
> > #include <linux/entry-common.h>
> > +#include <linux/tick.h>
> >
> > /* Transfer to guest mode work */
> > #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
> > @@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
> > static inline void xfer_to_guest_mode_prepare(void)
> > {
> > lockdep_assert_irqs_disabled();
> > - rcu_nocb_flush_deferred_wakeup();
> > + tick_nohz_user_enter_prepare();
> > }
> >
> > /**
> > diff --git a/include/linux/tick.h b/include/linux/tick.h
> > index 0bb80a7f05b9..bfd571f18cfd 100644
> > --- a/include/linux/tick.h
> > +++ b/include/linux/tick.h
> > @@ -11,6 +11,7 @@
> > #include <linux/context_tracking_state.h>
> > #include <linux/cpumask.h>
> > #include <linux/sched.h>
> > +#include <linux/rcupdate.h>
> >
> > #ifdef CONFIG_GENERIC_CLOCKEVENTS
> > extern void __init tick_init(void);
> > @@ -304,4 +305,10 @@ static inline void tick_nohz_task_switch(void)
> > __tick_nohz_task_switch();
> > }
> >
> > +static inline void tick_nohz_user_enter_prepare(void)
> > +{
> > + if (tick_nohz_full_cpu(smp_processor_id()))
> > + rcu_nocb_flush_deferred_wakeup();
> > +}
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index a0b3b04fb596..bf16395b9e13 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -5,6 +5,7 @@
> > #include <linux/highmem.h>
> > #include <linux/livepatch.h>
> > #include <linux/audit.h>
> > +#include <linux/tick.h>
> >
> > #include "common.h"
> >
> > @@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > local_irq_disable_exit_to_user();
> >
> > /* Check if any of the above work has queued a deferred wakeup */
> > - rcu_nocb_flush_deferred_wakeup();
> > + tick_nohz_user_enter_prepare();
> >
> > ti_work = READ_ONCE(current_thread_info()->flags);
> > }
> > @@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
> > lockdep_assert_irqs_disabled();
> >
> > /* Flush pending rcuog wakeup before the last need_resched() check */
> > - rcu_nocb_flush_deferred_wakeup();
> > + tick_nohz_user_enter_prepare();
> >
> > if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> > ti_work = exit_to_user_mode_loop(regs, ti_work);

2021-05-23 13:36:44

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [entry] 47b8ff194c: will-it-scale.per_process_ops -3.0% regression

On Wed, May 19, 2021 at 01:27:00PM +0800, Oliver Sang wrote:
> hi, Peter Zijlstra,
>
> On Fri, May 14, 2021 at 12:13:22PM +0200, Peter Zijlstra wrote:
> > On Fri, May 14, 2021 at 01:11:27AM +0200, Frederic Weisbecker wrote:
> >
> > > That sounds good, but two things:
> > >
> > > 1) Even if context tracking is not running, we still need to handle
> > > deferred wakeups on idle. But all user/guest/idle currently use the
> > > same function.
> > >
> > > 2) Context tracking may be running even when full nohz is not. But here only
> > > full nohz is concerned.
> > >
> > > So the change should rather be as follows (completely untested!).
> > > I rather put the static key check in tick.h in order not to involve
> > > deep dependencies inside rcupdate.h (especially rcupdate.h -> tick.h -> sched.h)
> >
> > Compiles and boots for me, 0day folks, could you please test this makes
> > the reported regression go away?
>
> sorry for late.
>
> on which base should we apply below patch?
> we tried to apply upon 4ae7dc97f726e ("entry/kvm: Explicitly flush pending rcuog
> wakeup before last rescheduling point") and v5.12, but both build failed like below:
>
> ERROR: modpost: "tick_nohz_full_mask" [arch/x86/kvm/kvm.ko] undefined!
>
> or is there any particular kconfig need to enable?
> (the kconfig which we used to build 47b8ff194c is in original report
> which maybe you could help have a look what we may miss)

Ah indeed I need to export tick_nohz_full_mask. Too bad but that's the fate
of many cpu masks anyway. Can you try the following patch instead? It boots
and seem to behave:

---
From: Frederic Weisbecker <[email protected]>
Date: Sun, 23 May 2021 14:23:30 +0200
Subject: [PATCH] tick/nohz: Only check for RCU deferred wakeup on user/guest
entry when needed

Checking for and processing RCU-nocb deferred wakeup upon user/guest
entry is only relevant when nohz_full runs on the local CPU, otherwise
the periodic tick should take care of it.

Make sure we don't needlessly pollute these fast-paths.

Reported-by: kernel test robot <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/entry-kvm.h | 3 ++-
include/linux/tick.h | 7 +++++++
kernel/entry/common.c | 5 +++--
kernel/time/tick-sched.c | 1 +
4 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 8b2b1d68b954..136b8d97d8c0 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -3,6 +3,7 @@
#define __LINUX_ENTRYKVM_H

#include <linux/entry-common.h>
+#include <linux/tick.h>

/* Transfer to guest mode work */
#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
@@ -57,7 +58,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
static inline void xfer_to_guest_mode_prepare(void)
{
lockdep_assert_irqs_disabled();
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();
}

/**
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7340613c7eff..1a0ff88fa107 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -11,6 +11,7 @@
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
+#include <linux/rcupdate.h>

#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void __init tick_init(void);
@@ -300,4 +301,10 @@ static inline void tick_nohz_task_switch(void)
__tick_nohz_task_switch();
}

+static inline void tick_nohz_user_enter_prepare(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ rcu_nocb_flush_deferred_wakeup();
+}
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a0b3b04fb596..bf16395b9e13 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include <linux/tick.h>

#include "common.h"

@@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
local_irq_disable_exit_to_user();

/* Check if any of the above work has queued a deferred wakeup */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();

ti_work = READ_ONCE(current_thread_info()->flags);
}
@@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();

/* Flush pending rcuog wakeup before the last need_resched() check */
- rcu_nocb_flush_deferred_wakeup();
+ tick_nohz_user_enter_prepare();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 828b091501ca..6784f27a3099 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
--
2.25.1