2021-03-17 05:04:26

by Josh Don

[permalink] [raw]
Subject: [PATCH] sched: Warn on long periods of pending need_resched

From: Paul Turner <[email protected]>

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in kernel mode (in a
non-preemptible state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. Monitoring is done via
the tick and the accuracy is hence limited to jiffy scale. This also
means that we won't trigger the warning if the tick is disabled.

If resched_latency_warn_ms is set to the default value, only one warning
will be produced per boot.

This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
likely that there is a missing cond_resched() somewhere.

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Josh Don <[email protected]>
---
We've caught various bugs using this patch. In fact, a follow-up patch to
this one introduces a fix for a missing cond_resched(). However,
this may be too noisy to have enabled by default (especially given that
it requires some tuning); I'm open to having this default disabled
instead.
kernel/sched/core.c | 91 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 6 +++
kernel/sysctl.c | 8 ++++
3 files changed, 105 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..ac037fc87d5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,25 @@ const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
+
+/*
+ * Print a warning if need_resched is set for at least this long. At the
+ * default value, only a single warning will be printed per boot.
+ *
+ * Values less than 2 disable the feature.
+ *
+ * A kernel compiled with CONFIG_KASAN tends to run more slowly on average.
+ * Increase the need_resched timeout to reduce false alarms.
+ */
+#ifdef CONFIG_KASAN
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 101
+#define RESCHED_BOOT_QUIET_SEC 600
+#else
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 51
+#define RESCHED_BOOT_QUIET_SEC 300
#endif
+int sysctl_resched_latency_warn_ms = RESCHED_DEFAULT_WARN_LATENCY_MS;
+#endif /* CONFIG_SCHED_DEBUG */

/*
* Number of tasks to iterate in a single balance run.
@@ -4520,6 +4538,71 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}

+#ifdef CONFIG_SCHED_DEBUG
+
+static inline u64 resched_latency_check(struct rq *rq)
+{
+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+ bool warn_only_once = (latency_warn_ms == RESCHED_DEFAULT_WARN_LATENCY_MS);
+ u64 need_resched_latency, now = rq_clock(rq);
+ static bool warned_once;
+
+ if (warn_only_once && warned_once)
+ return 0;
+
+ if (!need_resched() || latency_warn_ms < 2)
+ return 0;
+
+ /* Disable this warning for the first few mins after boot */
+ if (now < RESCHED_BOOT_QUIET_SEC * NSEC_PER_SEC)
+ return 0;
+
+ if (!rq->last_seen_need_resched_ns) {
+ rq->last_seen_need_resched_ns = now;
+ rq->ticks_without_resched = 0;
+ return 0;
+ }
+
+ rq->ticks_without_resched++;
+ need_resched_latency = now - rq->last_seen_need_resched_ns;
+ if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ return 0;
+
+ warned_once = true;
+
+ return need_resched_latency;
+}
+
+static inline void resched_latency_warn(int cpu, u64 latency)
+{
+ static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+ WARN(__ratelimit(&latency_check_ratelimit),
+ "CPU %d: need_resched set for > %llu ns (%d ticks) "
+ "without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
+
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+ long val;
+
+ if ((kstrtol(str, 0, &val))) {
+ pr_warn("Unable to set resched_latency_warn_ms\n");
+ return 1;
+ }
+
+ sysctl_resched_latency_warn_ms = val;
+ return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+static inline void resched_latency_warn(int cpu, u64 latency) {}
+#endif /* CONFIG_SCHED_DEBUG */
+
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4531,6 +4614,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+ u64 resched_latency;

arch_scale_freq_tick();
sched_clock_tick();
@@ -4541,11 +4625,15 @@ void scheduler_tick(void)
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
+ resched_latency = resched_latency_check(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);

rq_unlock(rq, &rf);

+ if (resched_latency)
+ resched_latency_warn(cpu, resched_latency);
+
perf_event_task_tick();

#ifdef CONFIG_SMP
@@ -5040,6 +5128,9 @@ static void __sched notrace __schedule(bool preempt)
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+ rq->last_seen_need_resched_ns = 0;
+#endif

if (likely(prev != next)) {
rq->nr_switches++;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..912a8886bc7f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -57,6 +57,7 @@
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
+#include <linux/ratelimit.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
@@ -963,6 +964,11 @@ struct rq {

atomic_t nr_iowait;

+#ifdef CONFIG_SCHED_DEBUG
+ u64 last_seen_need_resched_ns;
+ int ticks_without_resched;
+#endif
+
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..526094fc364e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -193,6 +193,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif /* CONFIG_SMP */
+extern int sysctl_resched_latency_warn_ms;
#endif /* CONFIG_SCHED_DEBUG */

#ifdef CONFIG_COMPACTION
@@ -1763,6 +1764,13 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
+ {
+ .procname = "resched_latency_warn_ms",
+ .data = &sysctl_resched_latency_warn_ms,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
--
2.31.0.rc2.261.g7f71774620-goog


2021-03-17 08:29:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched


* Josh Don <[email protected]> wrote:

> From: Paul Turner <[email protected]>
>
> CPU scheduler marks need_resched flag to signal a schedule() on a
> particular CPU. But, schedule() may not happen immediately in cases
> where the current task is executing in the kernel mode (no
> preemption state) for extended periods of time.
>
> This patch adds a warn_on if need_resched is pending for more than the
> time specified in sysctl resched_latency_warn_ms. Monitoring is done via
> the tick and the accuracy is hence limited to jiffy scale. This also
> means that we won't trigger the warning if the tick is disabled.

Looks useful.

> If resched_latency_warn_ms is set to the default value, only one warning
> will be produced per boot.

Looks like a value hack, should probably be a separate flag,
defaulting to warn-once.

> This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
> likely that there is a missing cond_resched() somewhere.

CONFIG_SCHED_DEBUG is default-y, so most distros have it enabled.

> +/*
> + * Print a warning if need_resched is set for at least this long. At the
> + * default value, only a single warning will be printed per boot.
> + *
> + * Values less than 2 disable the feature.
> + *
> + * A kernel compiled with CONFIG_KASAN tends to run more slowly on average.
> + * Increase the need_resched timeout to reduce false alarms.
> + */
> +#ifdef CONFIG_KASAN
> +#define RESCHED_DEFAULT_WARN_LATENCY_MS 101
> +#define RESCHED_BOOT_QUIET_SEC 600
> +#else
> +#define RESCHED_DEFAULT_WARN_LATENCY_MS 51
> +#define RESCHED_BOOT_QUIET_SEC 300
> #endif
> +int sysctl_resched_latency_warn_ms = RESCHED_DEFAULT_WARN_LATENCY_MS;
> +#endif /* CONFIG_SCHED_DEBUG */

I'd really just make this a single value - say 100 or 200 msecs.

> +static inline void resched_latency_warn(int cpu, u64 latency)
> +{
> + static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
> +
> + WARN(__ratelimit(&latency_check_ratelimit),
> + "CPU %d: need_resched set for > %llu ns (%d ticks) "
> + "without schedule\n",
> + cpu, latency, cpu_rq(cpu)->ticks_without_resched);
> +}

Could you please put the 'sched:' prefix into scheduler warnings.
Let's have a bit of a namespace structure in new warnings.

Thanks,

Ingo

2021-03-17 08:33:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched


* Josh Don <[email protected]> wrote:

> +static inline u64 resched_latency_check(struct rq *rq)
> +{
> + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
> + bool warn_only_once = (latency_warn_ms == RESCHED_DEFAULT_WARN_LATENCY_MS);
> + u64 need_resched_latency, now = rq_clock(rq);
> + static bool warned_once;
> +
> + if (warn_only_once && warned_once)
> + return 0;
> +
> + if (!need_resched() || latency_warn_ms < 2)
> + return 0;
> +
> + /* Disable this warning for the first few mins after boot */
> + if (now < RESCHED_BOOT_QUIET_SEC * NSEC_PER_SEC)
> + return 0;
> +
> + if (!rq->last_seen_need_resched_ns) {
> + rq->last_seen_need_resched_ns = now;
> + rq->ticks_without_resched = 0;
> + return 0;
> + }
> +
> + rq->ticks_without_resched++;

So AFAICS this will only really do something useful on full-nohz
kernels with sufficiently long scheduler ticks, right?

On other kernels the scheduler tick interrupt, when it returns to
user-space, will trigger a reschedule if it sees a need_resched.

Thanks,

Ingo

2021-03-18 00:08:15

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched

On Wed, Mar 17, 2021 at 1:25 AM Ingo Molnar <[email protected]> wrote:
>
> * Josh Don <[email protected]> wrote:
>
> > If resched_latency_warn_ms is set to the default value, only one warning
> > will be produced per boot.
>
> Looks like a value hack, should probably be a separate flag,
> defaulting to warn-once.

Agreed, done.

> > This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
> > likely that there is a missing cond_resched() somewhere.
>
> CONFIG_SCHED_DEBUG is default-y, so most distros have it enabled.

To avoid log spam for people who don't care, I was considering having
the feature default disabled. Perhaps a better alternative is to only
show a single line warning and not print the full backtrace by
default. Does the latter sound good to you?

> > +#ifdef CONFIG_KASAN
> > +#define RESCHED_DEFAULT_WARN_LATENCY_MS 101
> > +#define RESCHED_BOOT_QUIET_SEC 600
> > +#else
> > +#define RESCHED_DEFAULT_WARN_LATENCY_MS 51
> > +#define RESCHED_BOOT_QUIET_SEC 300
> > #endif
> > +int sysctl_resched_latency_warn_ms = RESCHED_DEFAULT_WARN_LATENCY_MS;
> > +#endif /* CONFIG_SCHED_DEBUG */
>
> I'd really just make this a single value - say 100 or 200 msecs.

Replacing these both with a single value (the more conservative
default of 100ms and 600s).

> > +static inline void resched_latency_warn(int cpu, u64 latency)
> > +{
> > + static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
> > +
> > + WARN(__ratelimit(&latency_check_ratelimit),
> > + "CPU %d: need_resched set for > %llu ns (%d ticks) "
> > + "without schedule\n",
> > + cpu, latency, cpu_rq(cpu)->ticks_without_resched);
> > +}
>
> Could you please put the 'sched:' prefix into scheduler warnings.
> Let's have a bit of a namespace structure in new warnings.

Sounds good, done.

2021-03-18 00:23:26

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched

On Wed, Mar 17, 2021 at 1:31 AM Ingo Molnar <[email protected]> wrote:
>
>
> * Josh Don <[email protected]> wrote:
>
> > +static inline u64 resched_latency_check(struct rq *rq)
> > +{
> > + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
> > + bool warn_only_once = (latency_warn_ms == RESCHED_DEFAULT_WARN_LATENCY_MS);
> > + u64 need_resched_latency, now = rq_clock(rq);
> > + static bool warned_once;
> > +
> > + if (warn_only_once && warned_once)
> > + return 0;
> > +
> > + if (!need_resched() || latency_warn_ms < 2)
> > + return 0;
> > +
> > + /* Disable this warning for the first few mins after boot */
> > + if (now < RESCHED_BOOT_QUIET_SEC * NSEC_PER_SEC)
> > + return 0;
> > +
> > + if (!rq->last_seen_need_resched_ns) {
> > + rq->last_seen_need_resched_ns = now;
> > + rq->ticks_without_resched = 0;
> > + return 0;
> > + }
> > +
> > + rq->ticks_without_resched++;
>
> So AFAICS this will only really do something useful on full-nohz
> kernels with sufficiently long scheduler ticks, right?

Not quite sure what you mean; it is actually the inverse? Since we
rely on the tick to detect the resched latency, on nohz-full we won't
have detection on cpus running a single thread. The ideal scenario is
!nohz-full and tick interval << warn_ms.

> On other kernels the scheduler tick interrupt, when it returns to
> user-space, will trigger a reschedule if it sees a need_resched.

True for the case where we return to userspace, but we could instead
be executing in a non-preemptible region of the kernel. This is where
we've seen/fixed kernel bugs.

Best,
Josh

2021-03-18 14:38:18

by kernel test robot

[permalink] [raw]
Subject: [sched] 663017c554: WARNING:at_kernel/sched/core.c:#scheduler_tick



Greeting,

FYI, we noticed the following commit (built with gcc-9):

commit: 663017c554fa0582b0109d7d5122274724a581a2 ("[PATCH] sched: Warn on long periods of pending need_resched")
url: https://github.com/0day-ci/linux/commits/Josh-Don/sched-Warn-on-long-periods-of-pending-need_resched/20210317-130220
base: https://git.kernel.org/cgit/linux/kernel/git/kees/linux.git for-next/pstore

in testcase: locktorture
version:
with following parameters:

runtime: 300s
test: cpuhotplug

test-description: This torture test consists of creating a number of kernel threads which acquire the lock and hold it for specific amount of time, thus simulating different critical region behaviors.
test-url: https://www.kernel.org/doc/Documentation/locking/locktorture.txt


on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


+------------------------------------------------+------------+------------+
| | 7db688e99c | 663017c554 |
+------------------------------------------------+------------+------------+
| boot_successes | 15 | 0 |
| boot_failures | 0 | 15 |
| WARNING:at_kernel/sched/core.c:#scheduler_tick | 0 | 15 |
| RIP:scheduler_tick | 0 | 15 |
| RIP:console_unlock | 0 | 6 |
| RIP:delay_tsc | 0 | 9 |
+------------------------------------------------+------------+------------+


If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>


[ 300.078613] WARNING: CPU: 1 PID: 440 at kernel/sched/core.c:4585 scheduler_tick (kbuild/src/consumer/kernel/sched/core.c:4585 kbuild/src/consumer/kernel/sched/core.c:4640)
[ 300.081225] Modules linked in: locktorture torture sr_mod cdrom intel_rapl_msr sg bochs_drm ppdev drm_vram_helper drm_ttm_helper intel_rapl_common ttm crct10dif_pclmul crc32_pclmul crc32c_intel drm_kms_helper ghash_clmulni_intel ata_generic rapl syscopyarea sysfillrect sysimgblt i2c_piix4 ipmi_devintf fb_sys_fops ipmi_msghandler ata_piix joydev drm libata serio_raw parport_pc parport ip_tables
[ 300.091527] CPU: 1 PID: 440 Comm: lock_torture_wr Not tainted 5.11.0-00004-g663017c554fa #1
[ 300.094135] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 300.096944] RIP: 0010:scheduler_tick (kbuild/src/consumer/kernel/sched/core.c:4585 kbuild/src/consumer/kernel/sched/core.c:4640)
[ 300.098878] Code: 4d 00 85 c0 74 86 4a 8b 14 ed 00 39 82 ad 48 89 d8 44 89 f6 48 c7 c7 78 91 74 ad 8b 8c 02 e8 09 00 00 4c 89 fa e8 34 d6 a6 00 <0f> 0b e9 5b ff ff ff 80 3d 37 d7 a8 01 00 0f 85 ba fe ff ff 48 c7
All code
========
0: 4d 00 85 c0 74 86 4a rex.WRB add %r8b,0x4a8674c0(%r13)
7: 8b 14 ed 00 39 82 ad mov -0x527dc700(,%rbp,8),%edx
e: 48 89 d8 mov %rbx,%rax
11: 44 89 f6 mov %r14d,%esi
14: 48 c7 c7 78 91 74 ad mov $0xffffffffad749178,%rdi
1b: 8b 8c 02 e8 09 00 00 mov 0x9e8(%rdx,%rax,1),%ecx
22: 4c 89 fa mov %r15,%rdx
25: e8 34 d6 a6 00 callq 0xa6d65e
2a:* 0f 0b ud2 <-- trapping instruction
2c: e9 5b ff ff ff jmpq 0xffffffffffffff8c
31: 80 3d 37 d7 a8 01 00 cmpb $0x0,0x1a8d737(%rip) # 0x1a8d76f
38: 0f 85 ba fe ff ff jne 0xfffffffffffffef8
3e: 48 rex.W
3f: c7 .byte 0xc7

Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: e9 5b ff ff ff jmpq 0xffffffffffffff62
7: 80 3d 37 d7 a8 01 00 cmpb $0x0,0x1a8d737(%rip) # 0x1a8d745
e: 0f 85 ba fe ff ff jne 0xfffffffffffffece
14: 48 rex.W
15: c7 .byte 0xc7
[ 300.104318] RSP: 0018:ffff9afa000c0ea0 EFLAGS: 00010086
[ 300.106365] RAX: 0000000000000000 RBX: 000000000002b040 RCX: 0000000000000027
[ 300.108888] RDX: 0000000000000027 RSI: 00000000ffff7fff RDI: ffff8e06bfd17cf8
[ 300.111662] RBP: ffff9afa000c0ed0 R08: ffff8e06bfd17cf0 R09: ffff9afa000c0cb8
[ 300.114329] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8e06bfd2b040
[ 300.117088] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000003197579
[ 300.119577] FS: 0000000000000000(0000) GS:ffff8e06bfd00000(0000) knlGS:0000000000000000
[ 300.122367] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 300.125010] CR2: 0000000000442d70 CR3: 00000001381b8000 CR4: 00000000000406e0
[ 300.127199] Call Trace:
[ 300.128778] <IRQ>
[ 300.130312] update_process_times (kbuild/src/consumer/kernel/time/timer.c:1790)
[ 300.132661] tick_sched_handle+0x1f/0x60
[ 300.134782] tick_sched_timer (kbuild/src/consumer/kernel/time/tick-sched.c:1374)
[ 300.136828] ? tick_sched_do_timer (kbuild/src/consumer/kernel/time/tick-sched.c:1356)
[ 300.138519] __hrtimer_run_queues (kbuild/src/consumer/kernel/time/hrtimer.c:1519 kbuild/src/consumer/kernel/time/hrtimer.c:1583)
[ 300.140292] hrtimer_interrupt (kbuild/src/consumer/kernel/time/hrtimer.c:1648)
[ 300.142182] __sysvec_apic_timer_interrupt (kbuild/src/consumer/arch/x86/include/asm/jump_label.h:25 kbuild/src/consumer/include/linux/jump_label.h:200 kbuild/src/consumer/arch/x86/include/asm/trace/irq_vectors.h:41 kbuild/src/consumer/arch/x86/kernel/apic/apic.c:1107)
[ 300.144442] asm_call_irq_on_stack (kbuild/src/consumer/arch/x86/entry/entry_64.S:796)
[ 300.146568] </IRQ>
[ 300.148560] sysvec_apic_timer_interrupt (kbuild/src/consumer/arch/x86/include/asm/irq_stack.h:38 kbuild/src/consumer/arch/x86/include/asm/irq_stack.h:89 kbuild/src/consumer/arch/x86/kernel/apic/apic.c:1100)
[ 300.150589] asm_sysvec_apic_timer_interrupt (kbuild/src/consumer/arch/x86/include/asm/idtentry.h:629)
[ 300.152893] RIP: 0010:delay_tsc (kbuild/src/consumer/arch/x86/include/asm/msr.h:234 kbuild/src/consumer/arch/x86/lib/delay.c:72)
[ 300.154700] Code: 48 09 c2 49 89 d0 eb 1f 65 ff 0d 34 a8 61 53 f3 90 65 ff 05 2b a8 61 53 65 8b 35 c4 4e 61 53 41 39 f1 75 22 41 89 f1 0f 01 f9 <66> 90 48 c1 e2 20 48 09 d0 48 89 c2 4c 29 c2 48 39 fa 72 ca 65 ff
All code
========
0: 48 09 c2 or %rax,%rdx
3: 49 89 d0 mov %rdx,%r8
6: eb 1f jmp 0x27
8: 65 ff 0d 34 a8 61 53 decl %gs:0x5361a834(%rip) # 0x5361a843
f: f3 90 pause
11: 65 ff 05 2b a8 61 53 incl %gs:0x5361a82b(%rip) # 0x5361a843
18: 65 8b 35 c4 4e 61 53 mov %gs:0x53614ec4(%rip),%esi # 0x53614ee3
1f: 41 39 f1 cmp %esi,%r9d
22: 75 22 jne 0x46
24: 41 89 f1 mov %esi,%r9d
27: 0f 01 f9 rdtscp
2a:* 66 90 xchg %ax,%ax <-- trapping instruction
2c: 48 c1 e2 20 shl $0x20,%rdx
30: 48 09 d0 or %rdx,%rax
33: 48 89 c2 mov %rax,%rdx
36: 4c 29 c2 sub %r8,%rdx
39: 48 39 fa cmp %rdi,%rdx
3c: 72 ca jb 0x8
3e: 65 gs
3f: ff .byte 0xff

Code starting with the faulting instruction
===========================================
0: 66 90 xchg %ax,%ax
2: 48 c1 e2 20 shl $0x20,%rdx
6: 48 09 d0 or %rdx,%rax
9: 48 89 c2 mov %rax,%rdx
c: 4c 29 c2 sub %r8,%rdx
f: 48 39 fa cmp %rdi,%rdx
12: 72 ca jb 0xffffffffffffffde
14: 65 gs
15: ff .byte 0xff
[ 300.160663] RSP: 0018:ffff9afa0065bed0 EFLAGS: 00000246
[ 300.163199] RAX: 00000000da0561b2 RBX: 0000000000000033 RCX: 0000000000000001
[ 300.165892] RDX: 00000000000000bd RSI: 0000000000000001 RDI: 00000000002932f3
[ 300.168766] RBP: ffff9afa0065bef0 R08: 000000bdd9ea4d8d R09: 0000000000000001
[ 300.171407] R10: 0000000000000185 R11: 0000000000a1baa3 R12: ffff8e05b893a980
[ 300.174227] R13: ffffffffc0486520 R14: ffff8e05b8a30ba0 R15: ffff9afa00a9bb98
[ 300.177142] ? lock_torture_reader (kbuild/src/consumer/kernel/locking/locktorture.c:623) locktorture
[ 300.179889] torture_spin_lock_write_delay (kbuild/src/consumer/kernel/locking/locktorture.c:165 (discriminator 5)) locktorture
[ 300.182414] lock_torture_writer (kbuild/src/consumer/kernel/locking/locktorture.c:645) locktorture
[ 300.185365] kthread (kbuild/src/consumer/kernel/kthread.c:292)
[ 300.187277] ? kthread_park (kbuild/src/consumer/kernel/kthread.c:245)
[ 300.189502] ret_from_fork (kbuild/src/consumer/arch/x86/entry/entry_64.S:302)
[ 300.191683] ---[ end trace 8374be259847a8ed ]---
[ 300.660870] smpboot: CPU 0 is now offline
[ 312.644235] 2021-03-18 06:09:15 rmmod locktorture
[ 312.644244]
[ 312.654928] spin_lock-torture: Stopping torture_shuffle task
[ 312.658047] spin_lock-torture: Stopping torture_shuffle
[ 312.660779] spin_lock-torture: Stopping torture_stutter
[ 312.666862] spin_lock-torture: Stopping torture_stutter task
[ 312.671159] spin_lock-torture: Stopping torture_onoff task
[ 312.673951] spin_lock-torture: Stopping torture_onoff
[ 312.678989] spin_lock-torture: Stopping lock_torture_writer task
[ 312.682047] spin_lock-torture: Stopping lock_torture_writer
[ 312.687277] spin_lock-torture: Stopping lock_torture_writer
[ 312.691345] spin_lock-torture: Stopping lock_torture_writer
[ 312.695613] spin_lock-torture: Stopping lock_torture_writer
[ 312.699827] spin_lock-torture: Stopping lock_torture_writer task
[ 312.704340] spin_lock-torture: Stopping lock_torture_writer task
[ 312.709602] spin_lock-torture: Stopping lock_torture_writer task
[ 312.713929] spin_lock-torture: Stopping lock_torture_stats task
[ 312.716148] Writes: Total: 437833814 Max/Min: 110268143/108834804 Fail: 0
[ 312.718110] spin_lock-torture: Stopping lock_torture_stats
[ 312.724378] Writes: Total: 437833814 Max/Min: 110268143/108834804 Fail: 0
[ 312.726381] spin_lock-torture:--- End of test: SUCCESS: nwriters_stress=4 nreaders_stress=0 stat_interval=60 verbose=1 shuffle_interval=3 stutter=5 shutdown_secs=0 onoff_interval=3 onoff_holdoff=30
[ 312.772639]
[ 12.556636] spin_lock-torture:--- Start of test: nwriters_stress=4 nreaders_stress=0 stat_interval=60 verbose=1 shuffle_interval=3 stutter=5 shutdown_secs=0 onoff_interval=3 onoff_holdoff=30
[ 312.772649]
[ 312.790691]
[ 12.559791] spin_lock-torture: Creating torture_onoff task
[ 312.790703]
[ 312.803821]
[ 12.569165] spin_lock-torture: torture_onoff task started
[ 312.803837]
[ 312.823428]
[ 12.569165] spin_lock-torture: Creating torture_shuffle task
[ 312.823442]
[ 312.838283]
[ 12.569170] spin_lock-torture: torture_onoff begin holdoff
[ 312.838291]
[ 312.849538]
[ 12.580144] spin_lock-torture: Creating torture_stutter task
[ 312.849549]
[ 312.859051]
[ 12.580145] spin_lock-torture: torture_shuffle task started
[ 312.859062]
[ 312.868568]
[ 12.588167] spin_lock-torture: Creating lock_torture_writer task
[ 312.868580]
[ 312.882269]
[ 12.588168] spin_lock-torture: torture_stutter task started
[ 312.882279]


To reproduce:

# build kernel
cd linux
cp config-5.11.0-00004-g663017c554fa .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz


git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email



---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation

Thanks,
Oliver Sang


Attachments:
(No filename) (12.04 kB)
config-5.11.0-00004-g663017c554fa (175.11 kB)
job-script (4.59 kB)
dmesg.xz (16.87 kB)
locktorture (2.70 kB)
Download all attachments

2021-03-18 20:44:27

by Josh Don

[permalink] [raw]
Subject: Re: [sched] 663017c554: WARNING:at_kernel/sched/core.c:#scheduler_tick

The warning is working as intended (WAI) — the test holds a spinlock for
100ms. However, since this
is expected for locktorture, it makes sense to not have the warning
enabled while the test is running. I can add that to the patch.

2021-03-19 09:04:19

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched

On Wed, Mar 17, 2021 at 05:06:31PM -0700, Josh Don wrote:
> On Wed, Mar 17, 2021 at 1:25 AM Ingo Molnar <[email protected]> wrote:
> >
> > * Josh Don <[email protected]> wrote:
> >
> > > If resched_latency_warn_ms is set to the default value, only one warning
> > > will be produced per boot.
> >
> > Looks like a value hack, should probably be a separate flag,
> > defaulting to warn-once.
>
> Agreed, done.
>
> > > This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
> > > likely that there is a missing cond_resched() somewhere.
> >
> > CONFIG_SCHED_DEBUG is default-y, so most distros have it enabled.
>
> To avoid log spam for people who don't care, I was considering having
> the feature default disabled. Perhaps a better alternative is to only
> show a single line warning and not print the full backtrace by
> default. Does the latter sound good to you?
>

Default disabling and hidden behind a static branch would be useful
because the majority of users are not going to know what to do about
a need_resched warning and the sysctl is not documented. As Ingo said,
SCHED_DEBUG is enabled by default a lot. While I'm not sure what motivates
most distributions, I have found it useful to display topology information
on boot and in rare cases, for the enabling/disabling of sched features.
Hence, sched debug features should avoid adding too much overhead where
possible.

The static branch would mean splitting the very large inline functions
added by the patch. The inline section should do a static check only and
do the main work in a function in kernel/sched/debug.c so it has minimal
overhead if unused.

--
Mel Gorman
SUSE Labs

2021-03-23 03:38:13

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH] sched: Warn on long periods of pending need_resched

On Fri, Mar 19, 2021 at 2:02 AM Mel Gorman <[email protected]> wrote:
>
> Default disabling and hidden behind a static branch would be useful
> because the majority of users are not going to know what to do about
> a need_resched warning and the sysctl is not documented. As Ingo said,
> SCHED_DEBUG is enabled by default a lot. While I'm not sure what motivates
> most distributions, I have found it useful to display topology information
> on boot and in rare cases, for the enabling/disabling of sched features.
> Hence, sched debug features should avoid adding too much overhead where
> possible.
>
> The static branch would mean splitting the very large inline functions
> added by the patch. The inline section should do a static check only and
> do the main work in a function in kernel/sched/debug.c so it has minimal
> overhead if unused.
>
> --
> Mel Gorman
> SUSE Labs

Makes sense, I'll include this in v2.

Thanks,
Josh