2021-04-22 12:39:10

by Peter Zijlstra

Subject: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

When switching on core-sched, CPUs need to agree which lock to use for
their RQ.

The new rule will be that rq->core_enabled will be toggled while
holding all rq->__locks that belong to a core. This means we need to
double check the rq->core_enabled value after each lock acquire and
retry if it changed.

This also has implications for those sites that take multiple RQ
locks, they need to be careful that the second lock doesn't end up
being the first lock.

Verify the lock pointer after acquiring the first lock, because if
they're on the same core, holding any of the rq->__lock instances will
pin the core state.

While there, change the rq->__lock order to CPU number, instead of rq
address, this greatly simplifies the next patch.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 41 +++++++++++------------------------------
2 files changed, 57 insertions(+), 32 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;

 void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 {
-        raw_spin_lock_nested(rq_lockp(rq), subclass);
+        raw_spinlock_t *lock;
+
+        if (sched_core_disabled()) {
+                raw_spin_lock_nested(&rq->__lock, subclass);
+                return;
+        }
+
+        for (;;) {
+                lock = rq_lockp(rq);
+                raw_spin_lock_nested(lock, subclass);
+                if (likely(lock == rq_lockp(rq)))
+                        return;
+                raw_spin_unlock(lock);
+        }
 }
 
 bool raw_spin_rq_trylock(struct rq *rq)
 {
-        return raw_spin_trylock(rq_lockp(rq));
+        raw_spinlock_t *lock;
+        bool ret;
+
+        if (sched_core_disabled())
+                return raw_spin_trylock(&rq->__lock);
+
+        for (;;) {
+                lock = rq_lockp(rq);
+                ret = raw_spin_trylock(lock);
+                if (!ret || (likely(lock == rq_lockp(rq))))
+                        return ret;
+                raw_spin_unlock(lock);
+        }
 }
 
 void raw_spin_rq_unlock(struct rq *rq)
@@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
         raw_spin_unlock(rq_lockp(rq));
 }
 
+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+        lockdep_assert_irqs_disabled();
+
+        if (rq1->cpu > rq2->cpu)
+                swap(rq1, rq2);
+
+        raw_spin_rq_lock(rq1);
+        if (rq_lockp(rq1) == rq_lockp(rq2))
+                return;
+
+        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+}
+#endif
+
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1113,6 +1113,11 @@ static inline bool is_migration_disabled
 #endif
 }
 
+static inline bool sched_core_disabled(void)
+{
+        return true;
+}
+
 static inline raw_spinlock_t *rq_lockp(struct rq *rq)
 {
         return &rq->__lock;
@@ -2231,10 +2236,12 @@ unsigned long arch_scale_freq_capacity(i
 }
 #endif
 
+
 #ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION
 
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION
 
 /*
  * fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2274,14 +2281,13 @@ static inline int _double_lock_balance(s
         if (likely(raw_spin_rq_trylock(busiest)))
                 return 0;
 
-        if (rq_lockp(busiest) >= rq_lockp(this_rq)) {
+        if (busiest->cpu > this_rq->cpu) {
                 raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
                 return 0;
         }
 
         raw_spin_rq_unlock(this_rq);
-        raw_spin_rq_lock(busiest);
-        raw_spin_rq_lock_nested(this_rq, SINGLE_DEPTH_NESTING);
+        double_rq_lock(this_rq, busiest);
 
         return 1;
 }
@@ -2334,31 +2340,6 @@ static inline void double_raw_lock(raw_s
 }
 
 /*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
-        __acquires(rq1->lock)
-        __acquires(rq2->lock)
-{
-        BUG_ON(!irqs_disabled());
-        if (rq_lockp(rq1) == rq_lockp(rq2)) {
-                raw_spin_rq_lock(rq1);
-                __acquire(rq2->lock); /* Fake it out ;) */
-        } else {
-                if (rq_lockp(rq1) < rq_lockp(rq2)) {
-                        raw_spin_rq_lock(rq1);
-                        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
-                } else {
-                        raw_spin_rq_lock(rq2);
-                        raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
-                }
-        }
-}
-
-/*
  * double_rq_unlock - safely unlock two runqueues
  *
  * Note this does not restore interrupts like task_rq_unlock,


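For context when reading the retry loops above: later in this series rq_lockp()
stops being a plain accessor and returns the core-wide lock once core scheduling
is enabled. A minimal sketch of that behaviour (illustrative only, based on the
next patch in the series; helper names vary across versions):

static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
        if (sched_core_enabled(rq))
                return &rq->core->__lock;       /* one lock shared by the core's SMT siblings */

        return &rq->__lock;                     /* plain per-rq lock */
}

Since rq->core_enabled only flips while every rq->__lock of the core is held,
re-reading rq_lockp() after the acquire and retrying on a mismatch is enough to
end up on the lock everybody else is using.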

2021-04-24 01:25:16

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

Hi Peter,

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;
>
> void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> {
> - raw_spin_lock_nested(rq_lockp(rq), subclass);
> + raw_spinlock_t *lock;
> +
> + if (sched_core_disabled()) {

Nothing to stop sched_core from being enabled right here? Leading to
us potentially taking the wrong lock.

> + raw_spin_lock_nested(&rq->__lock, subclass);
> + return;
> + }
> +
> + for (;;) {
> + lock = rq_lockp(rq);
> + raw_spin_lock_nested(lock, subclass);
> + if (likely(lock == rq_lockp(rq)))
> + return;
> + raw_spin_unlock(lock);
> + }
> }

2021-04-26 08:32:25

by Peter Zijlstra

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Fri, Apr 23, 2021 at 06:22:52PM -0700, Josh Don wrote:
> Hi Peter,
>
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;
> >
> > void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> > {
> > - raw_spin_lock_nested(rq_lockp(rq), subclass);
> > + raw_spinlock_t *lock;
> > +
> > + if (sched_core_disabled()) {
>
> Nothing to stop sched_core from being enabled right here? Leading to
> us potentially taking the wrong lock.
>
> > + raw_spin_lock_nested(&rq->__lock, subclass);
> > + return;
> > + }
> > +
> > + for (;;) {
> > + lock = rq_lockp(rq);
> > + raw_spin_lock_nested(lock, subclass);
> > + if (likely(lock == rq_lockp(rq)))
> > + return;
> > + raw_spin_unlock(lock);
> > + }
> > }

Very good; something like the below seems to be the best I can make of
it..

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f732642e3e09..1a81e9cc9e5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
 static void __sched_core_enable(void)
 {
         static_branch_enable(&__sched_core_enabled);
+        /*
+         * Ensure raw_spin_rq_*lock*() have completed before flipping.
+         */
+        synchronize_sched();
         __sched_core_flip(true);
         sched_core_assert_empty();
 }
@@ -449,16 +453,22 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 {
         raw_spinlock_t *lock;
 
+        preempt_disable();
         if (sched_core_disabled()) {
                 raw_spin_lock_nested(&rq->__lock, subclass);
+                /* preempt *MUST* still be disabled here */
+                preempt_enable_no_resched();
                 return;
         }
 
         for (;;) {
                 lock = __rq_lockp(rq);
                 raw_spin_lock_nested(lock, subclass);
-                if (likely(lock == __rq_lockp(rq)))
+                if (likely(lock == __rq_lockp(rq))) {
+                        /* preempt *MUST* still be disabled here */
+                        preempt_enable_no_resched();
                         return;
+                }
                 raw_spin_unlock(lock);
         }
 }
@@ -468,14 +478,20 @@ bool raw_spin_rq_trylock(struct rq *rq)
         raw_spinlock_t *lock;
         bool ret;
 
-        if (sched_core_disabled())
-                return raw_spin_trylock(&rq->__lock);
+        preempt_disable();
+        if (sched_core_disabled()) {
+                ret = raw_spin_trylock(&rq->__lock);
+                preempt_enable();
+                return ret;
+        }
 
         for (;;) {
                 lock = __rq_lockp(rq);
                 ret = raw_spin_trylock(lock);
-                if (!ret || (likely(lock == __rq_lockp(rq))))
+                if (!ret || (likely(lock == __rq_lockp(rq)))) {
+                        preempt_enable();
                         return ret;
+                }
                 raw_spin_unlock(lock);
         }
 }
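
To make the pairing explicit, an illustrative timeline (not part of the patch;
it assumes synchronize_sched()/synchronize_rcu() waits for all preempt-disabled
regions, which is the behaviour after the RCU flavour consolidation):

/*
 *   raw_spin_rq_lock_nested()             __sched_core_enable()
 *   -------------------------             ---------------------
 *   preempt_disable();
 *   if (sched_core_disabled())            static_branch_enable(&__sched_core_enabled);
 *           raw_spin_lock(&rq->__lock);   synchronize_sched();      // waits for the reader's
 *   ...                                                             //  preempt-disabled section
 *   preempt_enable_no_resched();          __sched_core_flip(true);  // takes every rq->__lock
 *                                                                   //  of the core first
 *
 * Any reader that sampled "disabled" has, by the time synchronize_sched()
 * returns, completed its preempt-disabled section, i.e. it either finished
 * or still holds &rq->__lock; __sched_core_flip() must take that same lock
 * before it can flip rq->core_enabled, so such a reader can no longer be
 * caught between the check and the acquire and end up on the wrong lock.
 */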

2021-04-26 22:22:35

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f732642e3e09..1a81e9cc9e5d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> static void __sched_core_enable(void)
> {
> static_branch_enable(&__sched_core_enabled);
> + /*
> + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> + */
> + synchronize_sched();

synchronize_rcu()

> __sched_core_flip(true);
> sched_core_assert_empty();
> }
> @@ -449,16 +453,22 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> {
> raw_spinlock_t *lock;
>
> + preempt_disable();
> if (sched_core_disabled()) {
> raw_spin_lock_nested(&rq->__lock, subclass);
> + /* preempt *MUST* still be disabled here */
> + preempt_enable_no_resched();
> return;
> }

This approach looks good to me. I'm guessing you went this route
instead of doing the re-check after locking in order to optimize the
disabled case?

Recommend a comment that the preempt_disable() here pairs with the
synchronize_rcu() in __sched_core_enable().

>
> for (;;) {
> lock = __rq_lockp(rq);
> raw_spin_lock_nested(lock, subclass);
> - if (likely(lock == __rq_lockp(rq)))
> + if (likely(lock == __rq_lockp(rq))) {
> + /* preempt *MUST* still be disabled here */
> + preempt_enable_no_resched();
> return;
> + }
> raw_spin_unlock(lock);
> }
> }

2021-04-27 17:11:44

by Don Hiatt

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Mon, Apr 26, 2021 at 3:21 PM Josh Don <[email protected]> wrote:
>
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f732642e3e09..1a81e9cc9e5d 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> > static void __sched_core_enable(void)
> > {
> > static_branch_enable(&__sched_core_enabled);
> > + /*
> > + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> > + */
> > + synchronize_sched();
>
> synchronize_rcu()
>
> > __sched_core_flip(true);
> > sched_core_assert_empty();
> > }
> > @@ -449,16 +453,22 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> > {
> > raw_spinlock_t *lock;
> >
> > + preempt_disable();
> > if (sched_core_disabled()) {
> > raw_spin_lock_nested(&rq->__lock, subclass);
> > + /* preempt *MUST* still be disabled here */
> > + preempt_enable_no_resched();
> > return;
> > }
>
> This approach looks good to me. I'm guessing you went this route
> instead of doing the re-check after locking in order to optimize the
> disabled case?
>
Hi Josh and Peter,

I've been running into soft lockups and hard lockups when running a script
that just cycles setting the cookie of a group of processes over and over again.

Unfortunately the only way I can reproduce this is by setting the cookies
on qemu. I've tried sysbench, stress-ng but those seem to work just fine.

I'm running Peter's branch and even tried the suggested changes here but
still see the same behavior. I enabled panic on hard lockup and here below
is a snippet of the log.

Is there anything you'd like me to try or have any debugging you'd like me to
do? I'd certainly like to get to the bottom of this.

Thanks and have a great day,

Don

[ 1164.129706] NMI backtrace for cpu 10
[ 1164.129707] CPU: 10 PID: 6994 Comm: CPU 2/KVM Kdump: loaded
Tainted: G I L 5.12.0-rc8+ #4
[ 1164.129707] Hardware name: Dell Inc. PowerEdge R640/0W23H8, BIOS
2.9.4 11/06/2020
[ 1164.129708] RIP: 0010:native_queued_spin_lock_slowpath+0x69/0x1e0
[ 1164.129708] Code: 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09
d0 a9 00 01 ff ff 75 20 85 c0 75 0c b8 01 00 00 00 66 89 07 5d c3 f3
90 8b 07 <84> c0 75 f8 b8 01 00 00 00 66 89 07 eb ec f6 c4 01 75 04 c6
47 01
[ 1164.129709] RSP: 0018:ffffb35d591e0e20 EFLAGS: 00000002
[ 1164.129710] RAX: 0000000000200101 RBX: ffff9ecb0096c180 RCX: 0000000000000001
[ 1164.129710] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9ecb0096c180
[ 1164.129711] RBP: ffffb35d591e0e20 R08: 0000000000000000 R09: 000000010002e2a6
[ 1164.129711] R10: 0000000000000000 R11: ffffb35d591e0ff8 R12: ffff9ecb0096c180
[ 1164.129712] R13: 000000000000000a R14: 000000000000000a R15: ffff9e6e8dc88000
[ 1164.129712] FS: 00007f91bd79f700(0000) GS:ffff9ecb00940000(0000)
knlGS:0000000000000000
[ 1164.129713] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1164.129713] CR2: 000000000255d05c CR3: 000000022eb8c005 CR4: 00000000007726e0
[ 1164.129714] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1164.129715] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1164.129715] PKRU: 55555554
[ 1164.129715] Call Trace:
[ 1164.129716] <IRQ>
[ 1164.129716] _raw_spin_lock+0x1f/0x30
[ 1164.129717] raw_spin_rq_lock_nested.part.93+0x2b/0x60
[ 1164.129717] raw_spin_rq_lock_nested.constprop.126+0x1a/0x20
[ 1164.129718] scheduler_tick+0x4c/0x240
[ 1164.129718] ? tick_sched_handle.isra.17+0x60/0x60
[ 1164.129719] update_process_times+0xab/0xc0
[ 1164.129719] tick_sched_handle.isra.17+0x21/0x60
[ 1164.129720] tick_sched_timer+0x6b/0x80
[ 1164.129720] __hrtimer_run_queues+0x10d/0x230
[ 1164.129721] hrtimer_interrupt+0xe7/0x230
[ 1164.129721] __sysvec_apic_timer_interrupt+0x62/0x100
[ 1164.129721] sysvec_apic_timer_interrupt+0x51/0xa0
[ 1164.129722] </IRQ>
[ 1164.129722] asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 1164.129722] RIP: 0010:smp_call_function_single+0xf6/0x130
[ 1164.129723] Code: 00 75 4d 48 83 c4 48 41 5a 5d 49 8d 62 f8 c3 48
89 75 c0 48 8d 75 b0 48 89 55 c8 e8 84 fe ff ff 8b 55 b8 83 e2 01 74
0a f3 90 <8b> 55 b8 83 e2 01 75 f6 eb c0 8b 05 4a d8 b0 01 85 c0 0f 85
67 ff
[ 1164.129724] RSP: 0018:ffffb35d5e20fb20 EFLAGS: 00000202
[ 1164.129725] RAX: 0000000000000000 RBX: ffff9e6e8df12600 RCX: ffffb35d5f2ffcc0
[ 1164.129725] RDX: 0000000000000001 RSI: ffffb35d5e20fb20 RDI: ffffb35d5e20fb20
[ 1164.129725] RBP: ffffb35d5e20fb70 R08: 0000000000001000 R09: 0000000000001907
[ 1164.129726] R10: ffffb35d5e20fb98 R11: 00000000003d0900 R12: 000000000000000a
[ 1164.129726] R13: 000000000000000c R14: 0000000000000000 R15: 0000000000000000
[ 1164.129727] ? vmclear_error+0x150/0x390 [kvm_intel]
[ 1164.129727] vmx_vcpu_load_vmcs+0x1b2/0x330 [kvm_intel]
[ 1164.129728] ? update_load_avg+0x1b8/0x640
[ 1164.129728] ? vmx_vcpu_load_vmcs+0x1b2/0x330 [kvm_intel]
[ 1164.129728] ? load_fixmap_gdt+0x23/0x30
[ 1164.129729] vmx_vcpu_load_vmcs+0x309/0x330 [kvm_intel]
[ 1164.129729] kvm_arch_vcpu_load+0x41/0x260 [kvm]
[ 1164.129729] ? __switch_to_xtra+0xe3/0x4e0
[ 1164.129730] kvm_io_bus_write+0xbc/0xf0 [kvm]
[ 1164.129730] ? kvm_io_bus_write+0xbc/0xf0 [kvm]
[ 1164.129730] finish_task_switch+0x184/0x290
[ 1164.129731] __schedule+0x30a/0x12f0
[ 1164.129731] schedule+0x40/0xb0
[ 1164.129732] kvm_vcpu_block+0x57/0x4b0 [kvm]
[ 1164.129732] kvm_arch_vcpu_ioctl_run+0x3fe/0x1b60 [kvm]
[ 1164.129732] kvm_vcpu_kick+0x59c/0xa10 [kvm]
[ 1164.129733] ? kvm_vcpu_kick+0x59c/0xa10 [kvm]
[ 1164.129733] __x64_sys_ioctl+0x96/0xd0
[ 1164.129733] do_syscall_64+0x37/0x80
[ 1164.129734] entry_SYSCALL_64_after_hwframe+0x44/0xae
[ 1164.129734] RIP: 0033:0x7fb1c8a4c317
[ 1164.129735] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00
00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00
00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89
01 48
[ 1164.129735] RSP: 002b:00007f91bd79e7e8 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[ 1164.129736] RAX: ffffffffffffffda RBX: 000000000000ae80 RCX: 00007fb1c8a4c317
[ 1164.129737] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000001a
[ 1164.129737] RBP: 0000000000000000 R08: 000055f3e6d027c0 R09: 00007f91a80040b8
[ 1164.129738] R10: 00007f91a80050a0 R11: 0000000000000246 R12: 00007fb1c4023010
[ 1164.129739] R13: 000055f3e569d880 R14: 00007fb1cb8e7000 R15: 00007ffd86806f70


> Recommend a comment that the preempt_disable() here pairs with the
> synchronize_rcu() in __sched_core_enable().
>
> >
> > for (;;) {
> > lock = __rq_lockp(rq);
> > raw_spin_lock_nested(lock, subclass);
> > - if (likely(lock == __rq_lockp(rq)))
> > + if (likely(lock == __rq_lockp(rq))) {
> > + /* preempt *MUST* still be disabled here */
> > + preempt_enable_no_resched();
> > return;
> > + }
> > raw_spin_unlock(lock);
> > }
> > }

2021-04-27 23:32:57

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Mon, Apr 26, 2021 at 3:21 PM Josh Don <[email protected]> wrote:
>
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f732642e3e09..1a81e9cc9e5d 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> > static void __sched_core_enable(void)
> > {
> > static_branch_enable(&__sched_core_enabled);
> > + /*
> > + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> > + */
> > + synchronize_sched();
>
> synchronize_rcu()
>
> > __sched_core_flip(true);
> > sched_core_assert_empty();
> > }
> > @@ -449,16 +453,22 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> > {
> > raw_spinlock_t *lock;
> >
> > + preempt_disable();
> > if (sched_core_disabled()) {
> > raw_spin_lock_nested(&rq->__lock, subclass);
> > + /* preempt *MUST* still be disabled here */
> > + preempt_enable_no_resched();
> > return;
> > }
>
> This approach looks good to me. I'm guessing you went this route
> instead of doing the re-check after locking in order to optimize the
> disabled case?
>
> Recommend a comment that the preempt_disable() here pairs with the
> synchronize_rcu() in __sched_core_enable().
>
> >
> > for (;;) {
> > lock = __rq_lockp(rq);
> > raw_spin_lock_nested(lock, subclass);
> > - if (likely(lock == __rq_lockp(rq)))
> > + if (likely(lock == __rq_lockp(rq))) {
> > + /* preempt *MUST* still be disabled here */
> > + preempt_enable_no_resched();
> > return;
> > + }
> > raw_spin_unlock(lock);
> > }
> > }

Also, did you mean to have a preempt_enable_no_resched() rather than
preempt_enable() in raw_spin_rq_trylock?

I went over the rq_lockp stuff again after Don's reported lockup. Most
uses are safe due to already holding an rq lock. However,
double_rq_unlock() is prone to race:

double_rq_unlock(rq1, rq2):
        /* Initial state: core sched enabled, and rq1 and rq2 are smt
           siblings. So, double_rq_lock(rq1, rq2) only took a single rq lock */
        raw_spin_rq_unlock(rq1);
        /* now not holding any rq lock */
        /* sched core disabled. Now __rq_lockp(rq1) != __rq_lockp(rq2), so we
           falsely unlock rq2 */
        if (__rq_lockp(rq1) != __rq_lockp(rq2))
                raw_spin_rq_unlock(rq2);
        else
                __release(rq2->lock);

Instead we can cache __rq_lockp(rq1) and __rq_lockp(rq2) before
releasing the lock, in order to prevent this. FWIW I think it is
likely that Don is seeing a different issue.
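
A minimal sketch of that caching idea, for illustration only (the fix Peter
posts further down the thread flips the unlock order instead):

static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
        __releases(rq1->lock)
        __releases(rq2->lock)
{
        /*
         * Sample the lock pointers while the lock(s) are still held, so a
         * concurrent core-sched enable/disable cannot change the comparison.
         */
        raw_spinlock_t *lock1 = __rq_lockp(rq1);
        raw_spinlock_t *lock2 = __rq_lockp(rq2);

        raw_spin_rq_unlock(rq1);
        if (lock1 != lock2)
                raw_spin_rq_unlock(rq2);
        else
                __release(rq2->lock);
}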

2021-04-27 23:36:32

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
> Hi Josh and Peter,
>
> I've been running into soft lockups and hard lockups when running a script
> that just cycles setting the cookie of a group of processes over and over again.
>
> Unfortunately the only way I can reproduce this is by setting the cookies
> on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
>
> I'm running Peter's branch and even tried the suggested changes here but
> still see the same behavior. I enabled panic on hard lockup and here below
> is a snippet of the log.
>
> Is there anything you'd like me to try or have any debugging you'd like me to
> do? I'd certainly like to get to the bottom of this.

Hi Don,

I tried to repro using qemu, but did not generate a lockup. Could you
provide more details on what your script is doing (or better yet,
share the script directly)? I would have expected you to potentially
hit a lockup if you were cycling sched_core being enabled and
disabled, but it sounds like you are just recreating the cookie for a
process group over and over?

Best,
Josh

2021-04-28 01:04:41

by Aubrey Li

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 7:36 AM Josh Don <[email protected]> wrote:
>
> On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
> > Hi Josh and Peter,
> >
> > I've been running into soft lockups and hard lockups when running a script
> > that just cycles setting the cookie of a group of processes over and over again.
> >
> > Unfortunately the only way I can reproduce this is by setting the cookies
> > on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
> >
> > I'm running Peter's branch and even tried the suggested changes here but
> > still see the same behavior. I enabled panic on hard lockup and here below
> > is a snippet of the log.
> >
> > Is there anything you'd like me to try or have any debugging you'd like me to
> > do? I'd certainly like to get to the bottom of this.
>
> Hi Don,
>
> I tried to repro using qemu, but did not generate a lockup. Could you
> provide more details on what your script is doing (or better yet,
> share the script directly)? I would have expected you to potentially
> hit a lockup if you were cycling sched_core being enabled and
> disabled, but it sounds like you are just recreating the cookie for a
> process group over and over?
>

I saw something similar on bare metal hardware. I also tried the suggested
patch here, with no luck. Panic stack attached with
softlockup_all_cpu_backtrace=1.
(sorry, my system has 192 cpus and somehow putting 184 cpus offline causes
system hang without any message...)

My script creates a core cookie for each of two process groups: one for
sysbench cpu, the other for sysbench mysql; mysqld (cookie=0) is also on
the same machine. The number of tasks in each group is the same as the
number of CPUs on the system, and the cookie is created only during task
startup.

Please let me know if the script is needed, I'll push it to github
with some cleanup.

Thanks,
-Aubrey


Attachments:
aubrey-ubuntu_2021-04-28_01-00-04.log (529.33 kB)

2021-04-28 06:04:43

by Li, Aubrey

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On 4/22/21 8:05 PM, Peter Zijlstra wrote:
> When switching on core-sched, CPUs need to agree which lock to use for
> their RQ.
>
> The new rule will be that rq->core_enabled will be toggled while
> holding all rq->__locks that belong to a core. This means we need to
> double check the rq->core_enabled value after each lock acquire and
> retry if it changed.
>
> This also has implications for those sites that take multiple RQ
> locks, they need to be careful that the second lock doesn't end up
> being the first lock.
>
> Verify the lock pointer after acquiring the first lock, because if
> they're on the same core, holding any of the rq->__lock instances will
> pin the core state.
>
> While there, change the rq->__lock order to CPU number, instead of rq
> address, this greatly simplifies the next patch.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 41 +++++++++++------------------------------
> 2 files changed, 57 insertions(+), 32 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;
>
> void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> {
> - raw_spin_lock_nested(rq_lockp(rq), subclass);
> + raw_spinlock_t *lock;
> +
> + if (sched_core_disabled()) {
> + raw_spin_lock_nested(&rq->__lock, subclass);
> + return;
> + }
> +
> + for (;;) {
> + lock = rq_lockp(rq);
> + raw_spin_lock_nested(lock, subclass);
> + if (likely(lock == rq_lockp(rq)))
> + return;
> + raw_spin_unlock(lock);
> + }
> }
>
> bool raw_spin_rq_trylock(struct rq *rq)
> {
> - return raw_spin_trylock(rq_lockp(rq));
> + raw_spinlock_t *lock;
> + bool ret;
> +
> + if (sched_core_disabled())
> + return raw_spin_trylock(&rq->__lock);
> +
> + for (;;) {
> + lock = rq_lockp(rq);
> + ret = raw_spin_trylock(lock);
> + if (!ret || (likely(lock == rq_lockp(rq))))
> + return ret;
> + raw_spin_unlock(lock);
> + }
> }
>
> void raw_spin_rq_unlock(struct rq *rq)
> @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> raw_spin_unlock(rq_lockp(rq));
> }
>
> +#ifdef CONFIG_SMP
> +/*
> + * double_rq_lock - safely lock two runqueues
> + */
> +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> +{
> + lockdep_assert_irqs_disabled();
> +
> + if (rq1->cpu > rq2->cpu)
> + swap(rq1, rq2);

I'm not sure why we swap the rqs here instead of the rq locks. This swaps the
dst rq and src rq and makes the subsequent logic wrong, at least in
try_steal_cookie().

Thanks,
-Aubrey

2021-04-28 06:06:14

by Li, Aubrey

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On 4/28/21 9:03 AM, Aubrey Li wrote:
> On Wed, Apr 28, 2021 at 7:36 AM Josh Don <[email protected]> wrote:
>>
>> On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
>>> Hi Josh and Peter,
>>>
>>> I've been running into soft lockups and hard lockups when running a script
>>> that just cycles setting the cookie of a group of processes over and over again.
>>>
>>> Unfortunately the only way I can reproduce this is by setting the cookies
>>> on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
>>>
>>> I'm running Peter's branch and even tried the suggested changes here but
>>> still see the same behavior. I enabled panic on hard lockup and here below
>>> is a snippet of the log.
>>>
>>> Is there anything you'd like me to try or have any debugging you'd like me to
>>> do? I'd certainly like to get to the bottom of this.
>>
>> Hi Don,
>>
>> I tried to repro using qemu, but did not generate a lockup. Could you
>> provide more details on what your script is doing (or better yet,
>> share the script directly)? I would have expected you to potentially
>> hit a lockup if you were cycling sched_core being enabled and
>> disabled, but it sounds like you are just recreating the cookie for a
>> process group over and over?
>>
>
> I saw something similar on a bare metal hardware. Also tried the suggested
> patch here and no luck. Panic stack attached with
> softlockup_all_cpu_backtrace=1.
> (sorry, my system has 192 cpus and somehow putting 184 cpus offline causes
> system hang without any message...)

Can you please try the following change to see if the problem is gone on your side?

Thanks,
-Aubrey

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f732642e3e09..1ef13b50dfcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -493,14 +493,17 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
 {
         lockdep_assert_irqs_disabled();
 
-        if (rq1->cpu > rq2->cpu)
-                swap(rq1, rq2);
-
-        raw_spin_rq_lock(rq1);
-        if (__rq_lockp(rq1) == __rq_lockp(rq2))
-                return;
-
-        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+        if (__rq_lockp(rq1) == __rq_lockp(rq2)) {
+                raw_spin_rq_lock(rq1);
+        } else {
+                if (__rq_lockp(rq1) < __rq_lockp(rq2)) {
+                        raw_spin_rq_lock(rq1);
+                        raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+                } else {
+                        raw_spin_rq_lock(rq2);
+                        raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
+                }
+        }
 }
 #endif

2021-04-28 07:19:15

by Peter Zijlstra

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Mon, Apr 26, 2021 at 03:21:36PM -0700, Josh Don wrote:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f732642e3e09..1a81e9cc9e5d 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> > static void __sched_core_enable(void)
> > {
> > static_branch_enable(&__sched_core_enabled);
> > + /*
> > + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> > + */
> > + synchronize_sched();
>
> synchronize_rcu()

Moo, I actually like synchronize_sched() because it indicates it matches
a preempt disabled region.

> > __sched_core_flip(true);
> > sched_core_assert_empty();
> > }
> > @@ -449,16 +453,22 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> > {
> > raw_spinlock_t *lock;
> >
> > + preempt_disable();
> > if (sched_core_disabled()) {
> > raw_spin_lock_nested(&rq->__lock, subclass);
> > + /* preempt *MUST* still be disabled here */
> > + preempt_enable_no_resched();
> > return;
> > }
>
> This approach looks good to me. I'm guessing you went this route
> instead of doing the re-check after locking in order to optimize the
> disabled case?

Exactly.

> Recommend a comment that the preempt_disable() here pairs with the
> synchronize_rcu() in __sched_core_enable().

Fair enough.

2021-04-28 09:17:01

by Peter Zijlstra

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Tue, Apr 27, 2021 at 04:30:02PM -0700, Josh Don wrote:

> Also, did you mean to have a preempt_enable_no_resched() rather than
> preempt_enable() in raw_spin_rq_trylock?

No, trylock really needs to be preempt_enable(), because it can have
failed, at which point it will not have incremented the preemption count
and our decrement can hit 0, at which point we really should reschedule.
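
Spelled out on the trylock path (an annotated excerpt; preempt counts are
relative to entry and the comments are illustrative):

        preempt_disable();              /* count: 0 -> 1                                  */
        ret = raw_spin_trylock(lock);   /* success takes its own preempt ref (1 -> 2),    */
                                        /* failure leaves the count at 1                  */
        preempt_enable();               /* failure: 1 -> 0, reschedules if needed;        */
                                        /* success: 2 -> 1, the held lock keeps it pinned */
        return ret;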

> I went over the rq_lockp stuff again after Don's reported lockup. Most
> uses are safe due to already holding an rq lock. However,
> double_rq_unlock() is prone to race:
>
> double_rq_unlock(rq1, rq2):
> /* Initial state: core sched enabled, and rq1 and rq2 are smt
> siblings. So, double_rq_lock(rq1, rq2) only took a single rq lock */
> raw_spin_rq_unlock(rq1);
> /* now not holding any rq lock */
> /* sched core disabled. Now __rq_lockp(rq1) != __rq_lockp(rq2), so we
> falsely unlock rq2 */
> if (__rq_lockp(rq1) != __rq_lockp(rq2))
> raw_spin_rq_unlock(rq2);
> else
> __release(rq2->lock);
>
> Instead we can cache __rq_lockp(rq1) and __rq_lockp(rq2) before
> releasing the lock, in order to prevent this. FWIW I think it is
> likely that Don is seeing a different issue.

Ah, indeed so.. rq_lockp() could do with an assertion, not sure how to
sanely do that. Anyway, double_rq_unlock() is simple enough to fix, we
can simply flip the unlock()s.

( I'm suffering a cold and am really quite slow atm )

How's this then?

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f732642e3e09..3a534c0c1c46 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
 static void __sched_core_enable(void)
 {
         static_branch_enable(&__sched_core_enabled);
+        /*
+         * Ensure raw_spin_rq_*lock*() have completed before flipping.
+         */
+        synchronize_sched();
         __sched_core_flip(true);
         sched_core_assert_empty();
 }
@@ -449,16 +453,23 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 {
         raw_spinlock_t *lock;
 
+        /* Matches synchronize_sched() in __sched_core_enabled() */
+        preempt_disable();
         if (sched_core_disabled()) {
                 raw_spin_lock_nested(&rq->__lock, subclass);
+                /* preempt-count *MUST* be > 1 */
+                preempt_enable_no_resched();
                 return;
         }
 
         for (;;) {
                 lock = __rq_lockp(rq);
                 raw_spin_lock_nested(lock, subclass);
-                if (likely(lock == __rq_lockp(rq)))
+                if (likely(lock == __rq_lockp(rq))) {
+                        /* preempt-count *MUST* be > 1 */
+                        preempt_enable_no_resched();
                         return;
+                }
                 raw_spin_unlock(lock);
         }
 }
@@ -468,14 +479,21 @@ bool raw_spin_rq_trylock(struct rq *rq)
         raw_spinlock_t *lock;
         bool ret;
 
-        if (sched_core_disabled())
-                return raw_spin_trylock(&rq->__lock);
+        /* Matches synchronize_sched() in __sched_core_enabled() */
+        preempt_disable();
+        if (sched_core_disabled()) {
+                ret = raw_spin_trylock(&rq->__lock);
+                preempt_enable();
+                return ret;
+        }
 
         for (;;) {
                 lock = __rq_lockp(rq);
                 ret = raw_spin_trylock(lock);
-                if (!ret || (likely(lock == __rq_lockp(rq))))
+                if (!ret || (likely(lock == __rq_lockp(rq)))) {
+                        preempt_enable();
                         return ret;
+                }
                 raw_spin_unlock(lock);
         }
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6a905fe19eef..c9a52231d58a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2568,11 +2568,12 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
         __releases(rq1->lock)
         __releases(rq2->lock)
 {
-        raw_spin_rq_unlock(rq1);
         if (__rq_lockp(rq1) != __rq_lockp(rq2))
                 raw_spin_rq_unlock(rq2);
         else
                 __release(rq2->lock);
+
+        raw_spin_rq_unlock(rq1);
 }
 
 extern void set_rq_online (struct rq *rq);

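For readers following along, the resulting double_rq_unlock() with the
reasoning spelled out as comments (the comments are illustrative, not in the
patch):

static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
        __releases(rq1->lock)
        __releases(rq2->lock)
{
        /*
         * The comparison now runs while rq1's lock (and rq2's, if distinct)
         * is still held.  If rq1 and rq2 are SMT siblings, the shared core
         * lock pins rq->core_enabled, so __rq_lockp() cannot change under
         * us; if they sit on different cores, the pointers differ no matter
         * what the core-sched state is.
         */
        if (__rq_lockp(rq1) != __rq_lockp(rq2))
                raw_spin_rq_unlock(rq2);
        else
                __release(rq2->lock);

        raw_spin_rq_unlock(rq1);
}
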
2021-04-28 13:45:59

by Aubrey Li

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 5:14 PM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Apr 27, 2021 at 04:30:02PM -0700, Josh Don wrote:
>
> > Also, did you mean to have a preempt_enable_no_resched() rather than
> > preempt_enable() in raw_spin_rq_trylock?
>
> No, trylock really needs to be preempt_enable(), because it can have
> failed, at which point it will not have incremented the preemption count
> and our decrement can hit 0, at which point we really should reschedule.
>
> > I went over the rq_lockp stuff again after Don's reported lockup. Most
> > uses are safe due to already holding an rq lock. However,
> > double_rq_unlock() is prone to race:
> >
> > double_rq_unlock(rq1, rq2):
> > /* Initial state: core sched enabled, and rq1 and rq2 are smt
> > siblings. So, double_rq_lock(rq1, rq2) only took a single rq lock */
> > raw_spin_rq_unlock(rq1);
> > /* now not holding any rq lock */
> > /* sched core disabled. Now __rq_lockp(rq1) != __rq_lockp(rq2), so we
> > falsely unlock rq2 */
> > if (__rq_lockp(rq1) != __rq_lockp(rq2))
> > raw_spin_rq_unlock(rq2);
> > else
> > __release(rq2->lock);
> >
> > Instead we can cache __rq_lockp(rq1) and __rq_lockp(rq2) before
> > releasing the lock, in order to prevent this. FWIW I think it is
> > likely that Don is seeing a different issue.
>
> Ah, indeed so.. rq_lockp() could do with an assertion, not sure how to
> sanely do that. Anyway, double_rq_unlock() is simple enough to fix, we
> can simply flip the unlock()s.
>
> ( I'm suffering a cold and am really quite slow atm )
>
> How's this then?
>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f732642e3e09..3a534c0c1c46 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> static void __sched_core_enable(void)
> {
> static_branch_enable(&__sched_core_enabled);
> + /*
> + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> + */
> + synchronize_sched();

synchronize_sched() no longer seems to exist...

Thanks,
-Aubrey

2021-04-28 13:46:39

by Peter Zijlstra

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 06:35:36PM +0800, Aubrey Li wrote:
> On Wed, Apr 28, 2021 at 5:14 PM Peter Zijlstra <[email protected]> wrote:

> > Ah, indeed so.. rq_lockp() could do with an assertion, not sure how to
> > sanely do that. Anyway, double_rq_unlock() is simple enough to fix, we
> > can simply flip the unlock()s.
> >
> > ( I'm suffering a cold and am really quite slow atm )
> >
> > How's this then?
> >
> > ---
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f732642e3e09..3a534c0c1c46 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> > static void __sched_core_enable(void)
> > {
> > static_branch_enable(&__sched_core_enabled);
> > + /*
> > + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> > + */
> > + synchronize_sched();
>
> synchronize_sched() no longer seems to exist...

Bah.. Paul, why did that go away? I realize RCU merged in the sched and
bh flavours, but I still find it expressive to use sync_sched() vs
preempt_disable().

Anyway, just use sync_rcu().

2021-04-28 13:47:50

by Aubrey Li

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 2:05 PM Aubrey Li <[email protected]> wrote:
>
> On 4/28/21 9:03 AM, Aubrey Li wrote:
> > On Wed, Apr 28, 2021 at 7:36 AM Josh Don <[email protected]> wrote:
> >>
> >> On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
> >>> Hi Josh and Peter,
> >>>
> >>> I've been running into soft lockups and hard lockups when running a script
> >>> that just cycles setting the cookie of a group of processes over and over again.
> >>>
> >>> Unfortunately the only way I can reproduce this is by setting the cookies
> >>> on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
> >>>
> >>> I'm running Peter's branch and even tried the suggested changes here but
> >>> still see the same behavior. I enabled panic on hard lockup and here below
> >>> is a snippet of the log.
> >>>
> >>> Is there anything you'd like me to try or have any debugging you'd like me to
> >>> do? I'd certainly like to get to the bottom of this.
> >>
> >> Hi Don,
> >>
> >> I tried to repro using qemu, but did not generate a lockup. Could you
> >> provide more details on what your script is doing (or better yet,
> >> share the script directly)? I would have expected you to potentially
> >> hit a lockup if you were cycling sched_core being enabled and
> >> disabled, but it sounds like you are just recreating the cookie for a
> >> process group over and over?
> >>
> >
> > I saw something similar on a bare metal hardware. Also tried the suggested
> > patch here and no luck. Panic stack attached with
> > softlockup_all_cpu_backtrace=1.
> > (sorry, my system has 192 cpus and somehow putting 184 cpus offline causes
> > system hang without any message...)
>
> Can you please try the following change to see if the problem is gone on your side?
>

Please ignore this patch, as the change of double_rq_unlock() in
Peter's last patch
fixed the problem.

Thanks,
-Aubrey

>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f732642e3e09..1ef13b50dfcd 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -493,14 +493,17 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
> {
> lockdep_assert_irqs_disabled();
>
> - if (rq1->cpu > rq2->cpu)
> - swap(rq1, rq2);
> -
> - raw_spin_rq_lock(rq1);
> - if (__rq_lockp(rq1) == __rq_lockp(rq2))
> - return;
> -
> - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> + if (__rq_lockp(rq1) == __rq_lockp(rq2)) {
> + raw_spin_rq_lock(rq1);
> + } else {
> + if (__rq_lockp(rq1) < __rq_lockp(rq2)) {
> + raw_spin_rq_lock(rq1);
> + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> + } else {
> + raw_spin_rq_lock(rq2);
> + raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
> + }
> + }
> }
> #endif
>

2021-04-28 18:11:25

by Paul E. McKenney

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 01:03:29PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 28, 2021 at 06:35:36PM +0800, Aubrey Li wrote:
> > On Wed, Apr 28, 2021 at 5:14 PM Peter Zijlstra <[email protected]> wrote:
>
> > > Ah, indeed so.. rq_lockp() could do with an assertion, not sure how to
> > > sanely do that. Anyway, double_rq_unlock() is simple enough to fix, we
> > > can simply flip the unlock()s.
> > >
> > > ( I'm suffering a cold and am really quite slow atm )
> > >
> > > How's this then?
> > >
> > > ---
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index f732642e3e09..3a534c0c1c46 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> > > static void __sched_core_enable(void)
> > > {
> > > static_branch_enable(&__sched_core_enabled);
> > > + /*
> > > + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> > > + */
> > > + synchronize_sched();
> >
> > synchronize_sched() no longer seems to exist...
>
> Bah.. Paul, why did that go away? I realize RCU merged in the sched and
> bh flavours, but I still find it expressive to use sync_sched() vs
> preempt_disable().

I could have made synchronize_sched() a synonym for synchronize_rcu(),
but that would be more likely to mislead than to help.

> Anyway, just use sync_rcu().

And yes, just use synchronize_rcu().

Thanx, Paul

2021-04-28 20:50:13

by Don Hiatt

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Tue, Apr 27, 2021 at 4:35 PM Josh Don <[email protected]> wrote:
>
> On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
> > Hi Josh and Peter,
> >
> > I've been running into soft lockups and hard lockups when running a script
> > that just cycles setting the cookie of a group of processes over and over again.
> >
> > Unfortunately the only way I can reproduce this is by setting the cookies
> > on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
> >
> > I'm running Peter's branch and even tried the suggested changes here but
> > still see the same behavior. I enabled panic on hard lockup and here below
> > is a snippet of the log.
> >
> > Is there anything you'd like me to try or have any debugging you'd like me to
> > do? I'd certainly like to get to the bottom of this.
>
> Hi Don,
>
> I tried to repro using qemu, but did not generate a lockup. Could you
> provide more details on what your script is doing (or better yet,
> share the script directly)? I would have expected you to potentially
> hit a lockup if you were cycling sched_core being enabled and
> disabled, but it sounds like you are just recreating the cookie for a
> process group over and over?
>
> Best,
> Josh

Hi Josh,

Sorry if I wasn't clear, but I'm running on bare metal (Peter's
5.12-rc8 repo) and have two qemu-system-x86_64 vms (1 with 8vcpu, the
other with 24 but it doesn't really matter). I then run a script [1]
that cycles setting the cookie for all the processes given the pid of
qemu-system-x86_64.

I also just found a lockup when I set the cookies for those two vms,
pinned them both to the same processor pair, and ran sysbench within
each vm to generate some load [2]. This is without cycling the
cookies, just setting once.

Thanks!

Don

----[1]---
This is a little test harness (I can provide the source but it is
based on your kselftests)
dhiatt@s2r5node34:~/do_coresched$ ./do_coresched -h
Usage for ./do_coresched
-c Create sched cookie for <pid>
-f Share sched cookie from <pid>
-g Get sched cookie from <pid>
-t Share sched cookie to <pid>
-z Clear sched cookie from <pid>
-p PID

Create: _prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, pid, PIDTYPE_PID, 0)
Share: _prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, pid, PIDTYPE_PID, 0)
Get: prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, pid, PIDTYPE_PID,
(unsigned long)&cookie);

--

dhiatt@s2r5node34:~/do_coresched$ cat set.sh
#!/bin/bash
# usage: set.sh pid

target=$1
echo target pid: $target
pids=`ps -eL | grep $target | awk '{print $2}' | sort -g`

echo "Setting cookies for $target"
./do_coresched -c -p $BASHPID
for i in $pids
do
        ./do_coresched -t -p $i
        ./do_coresched -g -p $i
done
---

---[2]---
[ 8911.926989] watchdog: BUG: soft lockup - CPU#53 stuck for 22s! [CPU
1/KVM:19539]
[ 8911.935727] NMI watchdog: Watchdog detected hard LOCKUP on cpu 6
[ 8911.935791] NMI watchdog: Watchdog detected hard LOCKUP on cpu 7
[ 8911.935908] NMI watchdog: Watchdog detected hard LOCKUP on cpu 12
[ 8911.935967] NMI watchdog: Watchdog detected hard LOCKUP on cpu 13
[ 8911.936070] NMI watchdog: Watchdog detected hard LOCKUP on cpu 19
[ 8911.936145] NMI watchdog: Watchdog detected hard LOCKUP on cpu 21
[ 8911.936220] NMI watchdog: Watchdog detected hard LOCKUP on cpu 23
[ 8911.936361] NMI watchdog: Watchdog detected hard LOCKUP on cpu 31
[ 8911.936453] NMI watchdog: Watchdog detected hard LOCKUP on cpu 34
[ 8911.936567] NMI watchdog: Watchdog detected hard LOCKUP on cpu 42
[ 8911.936627] NMI watchdog: Watchdog detected hard LOCKUP on cpu 46
[ 8911.936712] NMI watchdog: Watchdog detected hard LOCKUP on cpu 49
[ 8911.936827] NMI watchdog: Watchdog detected hard LOCKUP on cpu 58
[ 8911.936887] NMI watchdog: Watchdog detected hard LOCKUP on cpu 60
[ 8911.936969] NMI watchdog: Watchdog detected hard LOCKUP on cpu 70
[ 8915.926847] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[ 8915.932904] rcu: 2-...!: (7 ticks this GP)
idle=38e/1/0x4000000000000000 softirq=181538/181540 fqs=1178
[ 8915.942617] rcu: 14-...!: (2 GPs behind)
idle=d0e/1/0x4000000000000000 softirq=44825/44825 fqs=1178
[ 8915.954034] rcu: rcu_sched kthread timer wakeup didn't happen for
12568 jiffies! g462469 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402
[ 8915.965775] rcu: Possible timer handling issue on cpu=6 timer-softirq=10747
[ 8915.973021] rcu: rcu_sched kthread starved for 12572 jiffies!
g462469 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=6
[ 8915.983681] rcu: Unless rcu_sched kthread gets sufficient CPU time,
OOM is now expected behavior.
[ 8915.992879] rcu: RCU grace-period kthread stack dump:
[ 8915.998211] rcu: Stack dump where RCU GP kthread last ran:
[ 8939.925995] watchdog: BUG: soft lockup - CPU#53 stuck for 22s! [CPU
1/KVM:19539]
[ 8939.935274] NMI watchdog: Watchdog detected hard LOCKUP on cpu 25
[ 8939.935351] NMI watchdog: Watchdog detected hard LOCKUP on cpu 27
[ 8939.935425] NMI watchdog: Watchdog detected hard LOCKUP on cpu 29
[ 8939.935653] NMI watchdog: Watchdog detected hard LOCKUP on cpu 44
[ 8939.935845] NMI watchdog: Watchdog detected hard LOCKUP on cpu 63
[ 8963.997140] watchdog: BUG: soft lockup - CPU#71 stuck for 22s!
[SchedulerRunner:4405]

-----

2021-04-28 20:50:46

by Don Hiatt

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 3:57 AM Aubrey Li <[email protected]> wrote:
>
> On Wed, Apr 28, 2021 at 2:05 PM Aubrey Li <[email protected]> wrote:
> >
> > On 4/28/21 9:03 AM, Aubrey Li wrote:
> > > On Wed, Apr 28, 2021 at 7:36 AM Josh Don <[email protected]> wrote:
> > >>
> > >> On Tue, Apr 27, 2021 at 10:10 AM Don Hiatt <[email protected]> wrote:
> > >>> Hi Josh and Peter,
> > >>>
> > >>> I've been running into soft lockups and hard lockups when running a script
> > >>> that just cycles setting the cookie of a group of processes over and over again.
> > >>>
> > >>> Unfortunately the only way I can reproduce this is by setting the cookies
> > >>> on qemu. I've tried sysbench, stress-ng but those seem to work just fine.
> > >>>
> > >>> I'm running Peter's branch and even tried the suggested changes here but
> > >>> still see the same behavior. I enabled panic on hard lockup and here below
> > >>> is a snippet of the log.
> > >>>
> > >>> Is there anything you'd like me to try or have any debugging you'd like me to
> > >>> do? I'd certainly like to get to the bottom of this.
> > >>
> > >> Hi Don,
> > >>
> > >> I tried to repro using qemu, but did not generate a lockup. Could you
> > >> provide more details on what your script is doing (or better yet,
> > >> share the script directly)? I would have expected you to potentially
> > >> hit a lockup if you were cycling sched_core being enabled and
> > >> disabled, but it sounds like you are just recreating the cookie for a
> > >> process group over and over?
> > >>
> > >
> > > I saw something similar on a bare metal hardware. Also tried the suggested
> > > patch here and no luck. Panic stack attached with
> > > softlockup_all_cpu_backtrace=1.
> > > (sorry, my system has 192 cpus and somehow putting 184 cpus offline causes
> > > system hang without any message...)
> >
> > Can you please try the following change to see if the problem is gone on your side?
> >
>
> Please ignore this patch, as the change of double_rq_unlock() in
> Peter's last patch
> fixed the problem.
>
> Thanks,
> -Aubrey
>
I'm still seeing hard lockups while repeatedly setting cookies on qemu
processes even with
the updated patch. If there is any debug you'd like me to turn on,
just let me know.

Thanks!

don



> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f732642e3e09..1ef13b50dfcd 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -493,14 +493,17 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > {
> > lockdep_assert_irqs_disabled();
> >
> > - if (rq1->cpu > rq2->cpu)
> > - swap(rq1, rq2);
> > -
> > - raw_spin_rq_lock(rq1);
> > - if (__rq_lockp(rq1) == __rq_lockp(rq2))
> > - return;
> > -
> > - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> > + if (__rq_lockp(rq1) == __rq_lockp(rq2)) {
> > + raw_spin_rq_lock(rq1);
> > + } else {
> > + if (__rq_lockp(rq1) < __rq_lockp(rq2)) {
> > + raw_spin_rq_lock(rq1);
> > + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> > + } else {
> > + raw_spin_rq_lock(rq2);
> > + raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
> > + }
> > + }
> > }
> > #endif
> >

2021-04-29 08:05:26

by Aubrey Li

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 22, 2021 at 8:39 PM Peter Zijlstra <[email protected]> wrote:
>
> When switching on core-sched, CPUs need to agree which lock to use for
> their RQ.
>
> The new rule will be that rq->core_enabled will be toggled while
> holding all rq->__locks that belong to a core. This means we need to
> double check the rq->core_enabled value after each lock acquire and
> retry if it changed.
>
> This also has implications for those sites that take multiple RQ
> locks, they need to be careful that the second lock doesn't end up
> being the first lock.
>
> Verify the lock pointer after acquiring the first lock, because if
> they're on the same core, holding any of the rq->__lock instances will
> pin the core state.
>
> While there, change the rq->__lock order to CPU number, instead of rq
> address, this greatly simplifies the next patch.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 41 +++++++++++------------------------------
> 2 files changed, 57 insertions(+), 32 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
----snip----
> @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> raw_spin_unlock(rq_lockp(rq));
> }
>
> +#ifdef CONFIG_SMP
> +/*
> + * double_rq_lock - safely lock two runqueues
> + */
> +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> +{
> + lockdep_assert_irqs_disabled();
> +
> + if (rq1->cpu > rq2->cpu)

It's still a bit hard for me to digest this function; I guess using rq->cpu
can't guarantee the locking order when coresched is enabled.

- cpu1 and cpu7 shares lockA
- cpu2 and cpu8 shares lockB

double_rq_lock(1,8) leads to lock(A) and lock(B)
double_rq_lock(7,2) leads to lock(B) and lock(A)

change to below to avoid ABBA?
+ if (__rq_lockp(rq1) > __rq_lockp(rq2))

Please correct me if I was wrong.
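
To spell out the interleaving (a worked illustration of the scenario above,
not taken from the patch):

/*
 *   task X: double_rq_lock(rq1, rq8)        task Y: double_rq_lock(rq7, rq2)
 *     cpu 1 < cpu 8: no swap                  cpu 7 > cpu 2: swap -> (rq2, rq7)
 *     raw_spin_rq_lock(rq1)        -> A       raw_spin_rq_lock(rq2)        -> B
 *     rq_lockp(rq1) != rq_lockp(rq8)          rq_lockp(rq2) != rq_lockp(rq7)
 *     raw_spin_rq_lock_nested(rq8) -> B       raw_spin_rq_lock_nested(rq7) -> A
 *                          (blocks)                               (blocks)  => ABBA
 */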

Thanks,
-Aubrey

> + swap(rq1, rq2);
> +
> + raw_spin_rq_lock(rq1);
> + if (rq_lockp(rq1) == rq_lockp(rq2))
> + return;
> +
> + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> +}
> +#endif
> +

2021-04-29 20:41:35

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 2:13 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Apr 27, 2021 at 04:30:02PM -0700, Josh Don wrote:
>
> > Also, did you mean to have a preempt_enable_no_resched() rather than
> > preempt_enable() in raw_spin_rq_trylock?
>
> No, trylock really needs to be preempt_enable(), because it can have
> failed, at which point it will not have incremented the preemption count
> and our decrement can hit 0, at which point we really should reschedule.

Ah yes, makes sense. Did you want to use preempt_enable_no_resched()
at all then? No chance of preempt_count() being 1 at the point of
enable, as you comment below.

> > I went over the rq_lockp stuff again after Don's reported lockup. Most
> > uses are safe due to already holding an rq lock. However,
> > double_rq_unlock() is prone to race:
> >
> > double_rq_unlock(rq1, rq2):
> > /* Initial state: core sched enabled, and rq1 and rq2 are smt
> > siblings. So, double_rq_lock(rq1, rq2) only took a single rq lock */
> > raw_spin_rq_unlock(rq1);
> > /* now not holding any rq lock */
> > /* sched core disabled. Now __rq_lockp(rq1) != __rq_lockp(rq2), so we
> > falsely unlock rq2 */
> > if (__rq_lockp(rq1) != __rq_lockp(rq2))
> > raw_spin_rq_unlock(rq2);
> > else
> > __release(rq2->lock);
> >
> > Instead we can cache __rq_lockp(rq1) and __rq_lockp(rq2) before
> > releasing the lock, in order to prevent this. FWIW I think it is
> > likely that Don is seeing a different issue.
>
> Ah, indeed so.. rq_lockp() could do with an assertion, not sure how to
> sanely do that. Anyway, double_rq_unlock() is simple enough to fix, we
> can simply flip the unlock()s.
>
> ( I'm suffering a cold and am really quite slow atm )

No worries, hope it's a mild one.

> How's this then?

Looks good to me (other than the synchronize_sched()->synchronize_rcu()).

For these locking patches,
Reviewed-by: Josh Don <[email protected]>

I'll see if I can repro that lockup.

> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f732642e3e09..3a534c0c1c46 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -290,6 +290,10 @@ static void sched_core_assert_empty(void)
> static void __sched_core_enable(void)
> {
> static_branch_enable(&__sched_core_enabled);
> + /*
> + * Ensure raw_spin_rq_*lock*() have completed before flipping.
> + */
> + synchronize_sched();
> __sched_core_flip(true);
> sched_core_assert_empty();
> }
> @@ -449,16 +453,23 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> {
> raw_spinlock_t *lock;
>
> + /* Matches synchronize_sched() in __sched_core_enabled() */
> + preempt_disable();
> if (sched_core_disabled()) {
> raw_spin_lock_nested(&rq->__lock, subclass);
> + /* preempt-count *MUST* be > 1 */
> + preempt_enable_no_resched();
> return;
> }
>
> for (;;) {
> lock = __rq_lockp(rq);
> raw_spin_lock_nested(lock, subclass);
> - if (likely(lock == __rq_lockp(rq)))
> + if (likely(lock == __rq_lockp(rq))) {
> + /* preempt-count *MUST* be > 1 */
> + preempt_enable_no_resched();
> return;
> + }
> raw_spin_unlock(lock);
> }
> }
> @@ -468,14 +479,21 @@ bool raw_spin_rq_trylock(struct rq *rq)
> raw_spinlock_t *lock;
> bool ret;
>
> - if (sched_core_disabled())
> - return raw_spin_trylock(&rq->__lock);
> + /* Matches synchronize_sched() in __sched_core_enabled() */
> + preempt_disable();
> + if (sched_core_disabled()) {
> + ret = raw_spin_trylock(&rq->__lock);
> + preempt_enable();
> + return ret;
> + }
>
> for (;;) {
> lock = __rq_lockp(rq);
> ret = raw_spin_trylock(lock);
> - if (!ret || (likely(lock == __rq_lockp(rq))))
> + if (!ret || (likely(lock == __rq_lockp(rq)))) {
> + preempt_enable();
> return ret;
> + }
> raw_spin_unlock(lock);
> }
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6a905fe19eef..c9a52231d58a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2568,11 +2568,12 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
> __releases(rq1->lock)
> __releases(rq2->lock)
> {
> - raw_spin_rq_unlock(rq1);
> if (__rq_lockp(rq1) != __rq_lockp(rq2))
> raw_spin_rq_unlock(rq2);
> else
> __release(rq2->lock);
> +
> + raw_spin_rq_unlock(rq1);
> }
>
> extern void set_rq_online (struct rq *rq);

2021-04-29 20:51:20

by Josh Don

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 1:03 AM Aubrey Li <[email protected]> wrote:
>
> On Thu, Apr 22, 2021 at 8:39 PM Peter Zijlstra <[email protected]> wrote:
> ----snip----
> > @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> > raw_spin_unlock(rq_lockp(rq));
> > }
> >
> > +#ifdef CONFIG_SMP
> > +/*
> > + * double_rq_lock - safely lock two runqueues
> > + */
> > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > +{
> > + lockdep_assert_irqs_disabled();
> > +
> > + if (rq1->cpu > rq2->cpu)
>
> It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> can't guarantee the sequence of locking when coresched is enabled.
>
> - cpu1 and cpu7 shares lockA
> - cpu2 and cpu8 shares lockB
>
> double_rq_lock(1,8) leads to lock(A) and lock(B)
> double_rq_lock(7,2) leads to lock(B) and lock(A)
>
> change to below to avoid ABBA?
> + if (__rq_lockp(rq1) > __rq_lockp(rq2))
>
> Please correct me if I was wrong.

Great catch Aubrey. This is possibly what is causing the lockups that
Don is seeing.

The proposed usage of __rq_lockp() is prone to race with sched core
being enabled/disabled. It also won't order properly if we do
double_rq_lock(smt0, smt1) vs double_rq_lock(smt1, smt0), since these
would have equivalent __rq_lockp(). I'd propose an alternative but
similar idea: order by core, then break ties by ordering on cpu.

+#ifdef CONFIG_SCHED_CORE
+        if (rq1->core->cpu > rq2->core->cpu)
+                swap(rq1, rq2);
+        else if (rq1->core->cpu == rq2->core->cpu && rq1->cpu > rq2->cpu)
+                swap(rq1, rq2);
+#else
         if (rq1->cpu > rq2->cpu)
                 swap(rq1, rq2);
+#endif

2021-04-29 21:10:35

by Don Hiatt

Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 1:48 PM Josh Don <[email protected]> wrote:
>
> On Wed, Apr 28, 2021 at 9:41 AM Don Hiatt <[email protected]> wrote:
> >
> > I'm still seeing hard lockups while repeatedly setting cookies on qemu
> > processes even with
> > the updated patch. If there is any debug you'd like me to turn on,
> > just let me know.
> >
> > Thanks!
> >
> > don
>
> Thanks for the added context on your repro configuration. In addition
> to the updated patch from earlier, could you try the modification to
> double_rq_lock() from
> https://lkml.kernel.org/r/CABk29NuS-B3n4sbmavo0NDA1OCCsz6Zf2VDjjFQvAxBMQoJ_Lg@mail.gmail.com
> ? I have a feeling this is what's causing your lockup.
>
> Best,
> Josh

Hi Josh,

I've been running Aubrey+Peter's patch (attached) for almost 5 hours
and haven't had a single issue. :)

I'm running a set-cookie script every 5 seconds on the two VMs (each
vm is running
'sysbench --threads=1 --time=0 cpu run' to generate some load in the vm) and
I'm running two of the same sysbench runs on the HV while setting cookies
every 5 seconds.

Unless I jinxed us it looks like a great fix. :)

Let me know if there is anything else you'd like me to try. I'm going
to leave the tests running
and see what happens. I'll update with what I find.

Thanks!

don


Attachments:
aubrey-cosched.diff (2.27 kB)

2021-04-29 21:56:04

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, Apr 28, 2021 at 9:41 AM Don Hiatt <[email protected]> wrote:
>
> I'm still seeing hard lockups while repeatedly setting cookies on qemu
> processes even with
> the updated patch. If there is any debug you'd like me to turn on,
> just let me know.
>
> Thanks!
>
> don

Thanks for the added context on your repro configuration. In addition
to the updated patch from earlier, could you try the modification to
double_rq_lock() from
https://lkml.kernel.org/r/CABk29NuS-B3n4sbmavo0NDA1OCCsz6Zf2VDjjFQvAxBMQoJ_Lg@mail.gmail.com
? I have a feeling this is what's causing your lockup.

Best,
Josh

2021-04-29 23:24:57

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 2:09 PM Don Hiatt <[email protected]> wrote:
>
> On Thu, Apr 29, 2021 at 1:48 PM Josh Don <[email protected]> wrote:
> >
> > On Wed, Apr 28, 2021 at 9:41 AM Don Hiatt <[email protected]> wrote:
> > >
> > > I'm still seeing hard lockups while repeatedly setting cookies on qemu
> > > processes even with
> > > the updated patch. If there is any debug you'd like me to turn on,
> > > just let me know.
> > >
> > > Thanks!
> > >
> > > don
> >
> > Thanks for the added context on your repro configuration. In addition
> > to the updated patch from earlier, could you try the modification to
> > double_rq_lock() from
> > https://lkml.kernel.org/r/CABk29NuS-B3n4sbmavo0NDA1OCCsz6Zf2VDjjFQvAxBMQoJ_Lg@mail.gmail.com
> > ? I have a feeling this is what's causing your lockup.
> >
> > Best,
> > Josh
>
> Hi Josh,
>
> I've been running Aubrey+Peter's patch (attached) for almost 5 hours
> and haven't had a single issue. :)
>
> I'm running a set-cookie script every 5 seconds on the two VMs (each
> vm is running
> 'sysbench --threads=1 --time=0 cpu run' to generate some load in the vm) and
> I'm running two of the same sysbench runs on the HV while setting cookies
> every 5 seconds.
>
> Unless I jinxed us it looks like a great fix. :)
>
> Let me know if there is anything else you'd like me to try. I'm going
> to leave the tests running
> and see what happens. I'll update with what I find.
>
> Thanks!
>
> don

That's awesome news, thanks for validating. Note that with Aubrey's
patch there is still a race window if sched core is being
enabled/disabled (ie. if you alternate between there being some
cookies in the system, and no cookies). In my reply I posted an
alternative version to avoid that. If your script were to do the
on-off flipping with the old patch, you might eventually see another
lockup.

2021-04-30 08:21:47

by Aubrey Li

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Fri, Apr 30, 2021 at 4:40 AM Josh Don <[email protected]> wrote:
>
> On Thu, Apr 29, 2021 at 1:03 AM Aubrey Li <[email protected]> wrote:
> >
> > On Thu, Apr 22, 2021 at 8:39 PM Peter Zijlstra <[email protected]> wrote:
> > ----snip----
> > > @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> > > raw_spin_unlock(rq_lockp(rq));
> > > }
> > >
> > > +#ifdef CONFIG_SMP
> > > +/*
> > > + * double_rq_lock - safely lock two runqueues
> > > + */
> > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > +{
> > > + lockdep_assert_irqs_disabled();
> > > +
> > > + if (rq1->cpu > rq2->cpu)
> >
> > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > can't guarantee the sequence of locking when coresched is enabled.
> >
> > - cpu1 and cpu7 shares lockA
> > - cpu2 and cpu8 shares lockB
> >
> > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > double_rq_lock(7,2) leads to lock(B) and lock(A)
> >
> > change to below to avoid ABBA?
> > + if (__rq_lockp(rq1) > __rq_lockp(rq2))
> >
> > Please correct me if I was wrong.
>
> Great catch Aubrey. This is possibly what is causing the lockups that
> Don is seeing.
>
> The proposed usage of __rq_lockp() is prone to race with sched core
> being enabled/disabled. It also won't order properly if we do
> double_rq_lock(smt0, smt1) vs double_rq_lock(smt1, smt0), since these
> would have equivalent __rq_lockp().

If __rq_lockp(smt0) == __rq_lockp(smt1), rq0 and rq1 won't be swapped;
later only one rq is locked and the function just returns. I'm not sure
how that fails to order properly?

> I'd propose an alternative but similar idea: order by core, then break ties
> by ordering on cpu.
>
> +#ifdef CONFIG_SCHED_CORE
> + if (rq1->core->cpu > rq2->core->cpu)
> + swap(rq1, rq2);
> + else if (rq1->core->cpu == rq2->core->cpu && rq1->cpu > rq2->cpu)
> + swap(rq1, rq2);

That is, why the "else if" branch is needed?

> +#else
> if (rq1->cpu > rq2->cpu)
> swap(rq1, rq2);
> +#endif

2021-04-30 08:28:42

by Aubrey Li

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Fri, Apr 30, 2021 at 5:09 AM Don Hiatt <[email protected]> wrote:
>
> On Thu, Apr 29, 2021 at 1:48 PM Josh Don <[email protected]> wrote:
> >
> > On Wed, Apr 28, 2021 at 9:41 AM Don Hiatt <[email protected]> wrote:
> > >
> > > I'm still seeing hard lockups while repeatedly setting cookies on qemu
> > > processes even with
> > > the updated patch. If there is any debug you'd like me to turn on,
> > > just let me know.
> > >
> > > Thanks!
> > >
> > > don
> >
> > Thanks for the added context on your repro configuration. In addition
> > to the updated patch from earlier, could you try the modification to
> > double_rq_lock() from
> > https://lkml.kernel.org/r/CABk29NuS-B3n4sbmavo0NDA1OCCsz6Zf2VDjjFQvAxBMQoJ_Lg@mail.gmail.com
> > ? I have a feeling this is what's causing your lockup.
> >
> > Best,
> > Josh
>
> Hi Josh,
>
> I've been running Aubrey+Peter's patch (attached) for almost 5 hours
> and haven't had a single issue. :)
>
> I'm running a set-cookie script every 5 seconds on the two VMs (each
> vm is running
> 'sysbench --threads=1 --time=0 cpu run' to generate some load in the vm) and
> I'm running two of the same sysbench runs on the HV while setting cookies
> every 5 seconds.
>
> Unless I jinxed us it looks like a great fix. :)
>
> Let me know if there is anything else you'd like me to try. I'm going
> to leave the tests running
> and see what happens. I'll update with what I find.
>

This sounds great, thanks Don.

However, I guess we have the same problem in _double_lock_balance(), which
needs to be fixed as well.

Josh or I can make a better patch for this after Peter's comments.

Thanks,
-Aubrey

2021-04-30 08:49:46

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Fri, Apr 30, 2021 at 1:20 AM Aubrey Li <[email protected]> wrote:
>
> On Fri, Apr 30, 2021 at 4:40 AM Josh Don <[email protected]> wrote:
> >
> > On Thu, Apr 29, 2021 at 1:03 AM Aubrey Li <[email protected]> wrote:
> > >
> > > On Thu, Apr 22, 2021 at 8:39 PM Peter Zijlstra <[email protected]> wrote:
> > > ----snip----
> > > > @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> > > > raw_spin_unlock(rq_lockp(rq));
> > > > }
> > > >
> > > > +#ifdef CONFIG_SMP
> > > > +/*
> > > > + * double_rq_lock - safely lock two runqueues
> > > > + */
> > > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > > +{
> > > > + lockdep_assert_irqs_disabled();
> > > > +
> > > > + if (rq1->cpu > rq2->cpu)
> > >
> > > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > > can't guarantee the sequence of locking when coresched is enabled.
> > >
> > > - cpu1 and cpu7 shares lockA
> > > - cpu2 and cpu8 shares lockB
> > >
> > > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > > double_rq_lock(7,2) leads to lock(B) and lock(A)
> > >
> > > change to below to avoid ABBA?
> > > + if (__rq_lockp(rq1) > __rq_lockp(rq2))
> > >
> > > Please correct me if I was wrong.
> >
> > Great catch Aubrey. This is possibly what is causing the lockups that
> > Don is seeing.
> >
> > The proposed usage of __rq_lockp() is prone to race with sched core
> > being enabled/disabled. It also won't order properly if we do
> > double_rq_lock(smt0, smt1) vs double_rq_lock(smt1, smt0), since these
> > would have equivalent __rq_lockp().
>
> If __rq_lockp(smt0) == __rq_lockp(smt1), rq0 and rq1 won't be swapped;
> later only one rq is locked and the function just returns. I'm not sure
> how that fails to order properly?

If there is a concurrent switch from sched_core enable <-> disable,
the value of __rq_lockp() will race.

In the version you posted directly above, where we swap rq1 and rq2 if
__rq_lockp(rq1) > __rq_lockp(rq2) rather than comparing the cpu, the
following can happen:

cpu 1 and cpu 7 share a core lock when coresched is enabled

- schedcore enabled
- double_lock(7, 1)
- __rq_lockp compares equal for 7 and 1; no swap is done
- schedcore disabled; now __rq_lockp returns the per-rq lock
- lock(__rq_lockp(7)) => lock(7)
- lock(__rq_lockp(1)) => lock(1)

Then we can also have

- schedcore disabled
- double_lock(1, 7)
- __rq_lockp(1) < __rq_lockp(7), so no swap
- lock(__rq_lockp(1)) => lock(1)
- lock(__rq_lockp(7)) => lock(7)

So the first case takes the locks 7->1 and the second takes them 1->7.

>
> > I'd propose an alternative but similar idea: order by core, then break ties
> > by ordering on cpu.
> >
> > +#ifdef CONFIG_SCHED_CORE
> > + if (rq1->core->cpu > rq2->core->cpu)
> > + swap(rq1, rq2);
> > + else if (rq1->core->cpu == rq2->core->cpu && rq1->cpu > rq2->cpu)
> > + swap(rq1, rq2);
>
> That is, why the "else if" branch is needed?

Ensuring that core siblings always take their locks in the same order
if coresched is disabled.

>
> > +#else
> > if (rq1->cpu > rq2->cpu)
> > swap(rq1, rq2);
> > +#endif

Best,
Josh

2021-04-30 14:17:49

by Aubrey Li

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Fri, Apr 30, 2021 at 4:48 PM Josh Don <[email protected]> wrote:
>
> On Fri, Apr 30, 2021 at 1:20 AM Aubrey Li <[email protected]> wrote:
> >
> > On Fri, Apr 30, 2021 at 4:40 AM Josh Don <[email protected]> wrote:
> > >
> > > On Thu, Apr 29, 2021 at 1:03 AM Aubrey Li <[email protected]> wrote:
> > > >
> > > > On Thu, Apr 22, 2021 at 8:39 PM Peter Zijlstra <[email protected]> wrote:
> > > > ----snip----
> > > > > @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> > > > > raw_spin_unlock(rq_lockp(rq));
> > > > > }
> > > > >
> > > > > +#ifdef CONFIG_SMP
> > > > > +/*
> > > > > + * double_rq_lock - safely lock two runqueues
> > > > > + */
> > > > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > > > +{
> > > > > + lockdep_assert_irqs_disabled();
> > > > > +
> > > > > + if (rq1->cpu > rq2->cpu)
> > > >
> > > > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > > > can't guarantee the sequence of locking when coresched is enabled.
> > > >
> > > > - cpu1 and cpu7 shares lockA
> > > > - cpu2 and cpu8 shares lockB
> > > >
> > > > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > > > double_rq_lock(7,2) leads to lock(B) and lock(A)
> > > >
> > > > change to below to avoid ABBA?
> > > > + if (__rq_lockp(rq1) > __rq_lockp(rq2))
> > > >
> > > > Please correct me if I was wrong.
> > >
> > > Great catch Aubrey. This is possibly what is causing the lockups that
> > > Don is seeing.
> > >
> > > The proposed usage of __rq_lockp() is prone to race with sched core
> > > being enabled/disabled. It also won't order properly if we do
> > > double_rq_lock(smt0, smt1) vs double_rq_lock(smt1, smt0), since these
> > > would have equivalent __rq_lockp().
> >
> > If __rq_lockp(smt0) == __rq_lockp(smt1), rq0 and rq1 won't be swapped;
> > later only one rq is locked and the function just returns. I'm not sure
> > how that fails to order properly?
>
> If there is a concurrent switch from sched_core enable <-> disable,
> the value of __rq_lockp() will race.
>
> In the version you posted directly above, where we swap rq1 and rq2 if
> __rq_lockp(rq1) > __rq_lockp(rq2) rather than comparing the cpu, the
> following can happen:
>
> cpu 1 and cpu 7 share a core lock when coresched is enabled
>
> - schedcore enabled
> - double_lock(7, 1)
> - __rq_lockp compares equal for 7 and 1; no swap is done
> - schedcore disabled; now __rq_lockp returns the per-rq lock
> - lock(__rq_lockp(7)) => lock(7)
> - lock(__rq_lockp(1)) => lock(1)
>
> Then we can also have
>
> - schedcore disabled
> - double_lock(1, 7)
> - __rq_lockp(1) < __rq_lockp(7), so no swap
> - lock(__rq_lockp(1)) => lock(1)
> - lock(__rq_lockp(7)) => lock(7)
>
> So the first case takes the locks 7->1 and the second takes them 1->7.
>
> >
> > > I'd propose an alternative but similar idea: order by core, then break ties
> > > by ordering on cpu.
> > >
> > > +#ifdef CONFIG_SCHED_CORE
> > > + if (rq1->core->cpu > rq2->core->cpu)
> > > + swap(rq1, rq2);
> > > + else if (rq1->core->cpu == rq2->core->cpu && rq1->cpu > rq2->cpu)
> > > + swap(rq1, rq2);
> >
> > That is, why the "else if" branch is needed?
>
> Ensuring that core siblings always take their locks in the same order
> if coresched is disabled.
>

Both this and above make sense to me. Thanks for the great elaboration, Josh!

Thanks,
-Aubrey

2021-04-30 16:21:50

by Don Hiatt

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 4:22 PM Josh Don <[email protected]> wrote:
>
> On Thu, Apr 29, 2021 at 2:09 PM Don Hiatt <[email protected]> wrote:
> >
> > On Thu, Apr 29, 2021 at 1:48 PM Josh Don <[email protected]> wrote:
> > >
> > > On Wed, Apr 28, 2021 at 9:41 AM Don Hiatt <[email protected]> wrote:
> > > >
> > > > I'm still seeing hard lockups while repeatedly setting cookies on qemu
> > > > processes even with
> > > > the updated patch. If there is any debug you'd like me to turn on,
> > > > just let me know.
> > > >
> > > > Thanks!
> > > >
> > > > don
> > >
> > > Thanks for the added context on your repro configuration. In addition
> > > to the updated patch from earlier, could you try the modification to
> > > double_rq_lock() from
> > > https://lkml.kernel.org/r/CABk29NuS-B3n4sbmavo0NDA1OCCsz6Zf2VDjjFQvAxBMQoJ_Lg@mail.gmail.com
> > > ? I have a feeling this is what's causing your lockup.
> > >
> > > Best,
> > > Josh
> >
> > Hi Josh,
> >
> > I've been running Aubrey+Peter's patch (attached) for almost 5 hours
> > and haven't had a single issue. :)
> >
> > I'm running a set-cookie script every 5 seconds on the two VMs (each
> > vm is running
> > 'sysbench --threads=1 --time=0 cpu run' to generate some load in the vm) and
> > I'm running two of the same sysbench runs on the HV while setting cookies
> > every 5 seconds.
> >
> > Unless I jinxed us it looks like a great fix. :)
> >
> > Let me know if there is anything else you'd like me to try. I'm going
> > to leave the tests running
> > and see what happens. I'll update with what I find.
> >
> > Thanks!
> >
> > don
>
> That's awesome news, thanks for validating. Note that with Aubrey's
> patch there is still a race window if sched core is being
> enabled/disabled (ie. if you alternate between there being some
> cookies in the system, and no cookies). In my reply I posted an
> alternative version to avoid that. If your script were to do the
> on-off flipping with the old patch, you might eventually see another
> lockup.

My tests have been running for 24 hours now and all is good. I'll do another
test with your changes next, and will continue to test the core-sched queue.

This and all the other patches in Peter's repo:
Tested-by: Don Hiatt <[email protected]>

Have a great day and thanks again.

don

2021-05-03 19:23:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 01:11:54PM -0700, Josh Don wrote:
> On Wed, Apr 28, 2021 at 2:13 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Tue, Apr 27, 2021 at 04:30:02PM -0700, Josh Don wrote:
> >
> > > Also, did you mean to have a preempt_enable_no_resched() rather than
> > > prempt_enable() in raw_spin_rq_trylock?
> >
> > No, trylock really needs to be preempt_enable(), because it can have
> > failed, at which point it will not have incremented the preemption count
> > and our decrement can hit 0, at which point we really should reschedule.
>
> Ah yes, makes sense. Did you want to use preempt_enable_no_resched()
> at all then? No chance of preempt_count() being 1 at the point of
> enable, as you comment below.

preempt_enable_no_resched() avoids the branch which we know we'll never
take. It generates slightly saner code. It's not much, but every little
bit helps, right? ;-)
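
For reference, this is roughly what the two macros boil down to with
CONFIG_PREEMPTION (paraphrased from include/linux/preempt.h; the exact
definitions vary by config):

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test()))	/* the extra branch */ \
		__preempt_schedule();			/* may reschedule */ \
} while (0)

#define preempt_enable_no_resched() \
do { \
	barrier(); \
	preempt_count_dec();				/* just drop the count */ \
} while (0)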

> > ( I'm suffering a cold and am really quite slow atm )
>
> No worries, hope it's a mild one.

It's not the super popular one from '19, and the family seems to be
mostly recovered again, so all's well.

> > How's this then?
>
> Looks good to me (other than the synchronize_sched()->synchronize_rcu()).
>
> For these locking patches,
> Reviewed-by: Josh Don <[email protected]>

Thanks! I'll go fold them into the proper place and update the repo.

2021-05-04 07:45:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Thu, Apr 29, 2021 at 01:39:54PM -0700, Josh Don wrote:

> > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > +{
> > > + lockdep_assert_irqs_disabled();
> > > +
> > > + if (rq1->cpu > rq2->cpu)
> >
> > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > can't guarantee the sequence of locking when coresched is enabled.
> >
> > - cpu1 and cpu7 shares lockA
> > - cpu2 and cpu8 shares lockB
> >
> > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > double_rq_lock(7,2) leads to lock(B) and lock(A)

Good one!

> > change to below to avoid ABBA?
> > + if (__rq_lockp(rq1) > __rq_lockp(rq2))

This, however, is badly broken: not only does it suffer from the problem Josh
pointed out, it also breaks the rq->__lock ordering vs
__sched_core_flip(), which was the whole reason the ordering needed
changing in the first place.

> I'd propose an alternative but
> similar idea: order by core, then break ties by ordering on cpu.
>
> +#ifdef CONFIG_SCHED_CORE
> + if (rq1->core->cpu > rq2->core->cpu)
> + swap(rq1, rq2);
> + else if (rq1->core->cpu == rq2->core->cpu && rq1->cpu > rq2->cpu)
> + swap(rq1, rq2);
> +#else
> if (rq1->cpu > rq2->cpu)
> swap(rq1, rq2);
> +#endif

I've written it like so:

static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
#ifdef CONFIG_SCHED_CORE
if (rq1->core->cpu < rq2->core->cpu)
return true;
if (rq1->core->cpu > rq2->core->cpu)
return false;
#endif
return rq1->cpu < rq2->cpu;
}

/*
* double_rq_lock - safely lock two runqueues
*/
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
lockdep_assert_irqs_disabled();

if (rq_order_less(rq2, rq1))
swap(rq1, rq2);

raw_spin_rq_lock(rq1);
if (rq_lockp(rq1) == rq_lockp(rq2))
return;

raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
}
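
As a usage sketch (the caller below is made up for illustration):
double_rq_lock() only asserts that interrupts are off, so callers still
disable them themselves, and double_rq_unlock() can then release both rqs
regardless of which one was locked first:

/* Hypothetical caller, for illustration only. */
static void example_pull_work(struct rq *this_rq, struct rq *busiest)
{
	unsigned long flags;

	local_irq_save(flags);
	double_rq_lock(this_rq, busiest);	/* stable order via rq_order_less() */

	/* ... both runqueues are now locked; move work between them ... */

	double_rq_unlock(this_rq, busiest);
	local_irq_restore(flags);
}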

2021-05-05 20:09:13

by Don Hiatt

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Tue, May 4, 2021 at 12:38 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Apr 29, 2021 at 01:39:54PM -0700, Josh Don wrote:
>
> > > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > > +{
> > > > + lockdep_assert_irqs_disabled();
> > > > +
> > > > + if (rq1->cpu > rq2->cpu)
> > >
> > > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > > can't guarantee the sequence of locking when coresched is enabled.
> > >
> > > - cpu1 and cpu7 shares lockA
> > > - cpu2 and cpu8 shares lockB
> > >
> > > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > > double_rq_lock(7,2) leads to lock(B) and lock(A)
>
> Good one!

Hi Peter,

I've been running the same set-cookie tests on your latest repo for
the last 24 hours and haven't had a single lockup. Thank you very
much!

don

2021-05-06 10:34:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 04/19] sched: Prepare for Core-wide rq->lock

On Wed, May 05, 2021 at 09:20:38AM -0700, Don Hiatt wrote:
> On Tue, May 4, 2021 at 12:38 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Thu, Apr 29, 2021 at 01:39:54PM -0700, Josh Don wrote:
> >
> > > > > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
> > > > > +{
> > > > > + lockdep_assert_irqs_disabled();
> > > > > +
> > > > > + if (rq1->cpu > rq2->cpu)
> > > >
> > > > It's still a bit hard for me to digest this function, I guess using (rq->cpu)
> > > > can't guarantee the sequence of locking when coresched is enabled.
> > > >
> > > > - cpu1 and cpu7 shares lockA
> > > > - cpu2 and cpu8 shares lockB
> > > >
> > > > double_rq_lock(1,8) leads to lock(A) and lock(B)
> > > > double_rq_lock(7,2) leads to lock(B) and lock(A)
> >
> > Good one!
>
> Hi Peter,
>
> I've been running the same set-cookie tests on your latest repo for
> the last 24 hours and haven't had a single lockup. Thank you very
> much!

Excellent, applied your Tested-by, thanks!

2021-05-07 12:35:10

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH v2 04/19] sched: Prepare for Core-wide rq->lock


When switching on core-sched, CPUs need to agree which lock to use for
their RQ.

The new rule will be that rq->core_enabled will be toggled while
holding all rq->__locks that belong to a core. This means we need to
double check the rq->core_enabled value after each lock acquire and
retry if it changed.

This also has implications for those sites that take multiple RQ
locks, they need to be careful that the second lock doesn't end up
being the first lock.

Verify the lock pointer after acquiring the first lock, because if
they're on the same core, holding any of the rq->__lock instances will
pin the core state.

While there, change the rq->__lock order to CPU number, instead of rq
address, this greatly simplifies the next patch.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Don Hiatt <[email protected]>
Tested-by: Hongyu Ning <[email protected]>
---
kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 48 +++++++++++++++++-------------------------------
2 files changed, 63 insertions(+), 33 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;

void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
- raw_spin_lock_nested(rq_lockp(rq), subclass);
+ raw_spinlock_t *lock;
+
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ return;
+ }
+
+ for (;;) {
+ lock = rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == rq_lockp(rq)))
+ return;
+ raw_spin_unlock(lock);
+ }
}

bool raw_spin_rq_trylock(struct rq *rq)
{
- return raw_spin_trylock(rq_lockp(rq));
+ raw_spinlock_t *lock;
+ bool ret;
+
+ if (sched_core_disabled())
+ return raw_spin_trylock(&rq->__lock);
+
+ for (;;) {
+ lock = rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == rq_lockp(rq))))
+ return ret;
+ raw_spin_unlock(lock);
+ }
}

void raw_spin_rq_unlock(struct rq *rq)
@@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
raw_spin_unlock(rq_lockp(rq));
}

+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (rq_lockp(rq1) == rq_lockp(rq2))
+ return;
+
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+}
+#endif
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1113,6 +1113,11 @@ static inline bool is_migration_disabled
#endif
}

+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
return &rq->__lock;
@@ -2231,10 +2236,17 @@ unsigned long arch_scale_freq_capacity(i
}
#endif

+
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION

-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
+{
+ return rq1->cpu < rq2->cpu;
+}
+
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION

/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2274,14 +2286,13 @@ static inline int _double_lock_balance(s
if (likely(raw_spin_rq_trylock(busiest)))
return 0;

- if (rq_lockp(busiest) >= rq_lockp(this_rq)) {
+ if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
return 0;
}

raw_spin_rq_unlock(this_rq);
- raw_spin_rq_lock(busiest);
- raw_spin_rq_lock_nested(this_rq, SINGLE_DEPTH_NESTING);
+ double_rq_lock(this_rq, busiest);

return 1;
}
@@ -2334,31 +2345,6 @@ static inline void double_raw_lock(raw_s
}

/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq_lockp(rq1) == rq_lockp(rq2)) {
- raw_spin_rq_lock(rq1);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq_lockp(rq1) < rq_lockp(rq2)) {
- raw_spin_rq_lock(rq1);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_rq_lock(rq2);
- raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
- }
- }
-}
-
-/*
* double_rq_unlock - safely unlock two runqueues
*
* Note this does not restore interrupts like task_rq_unlock,
@@ -2368,11 +2354,11 @@ static inline void double_rq_unlock(stru
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_rq_unlock(rq1);
if (rq_lockp(rq1) != rq_lockp(rq2))
raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}

extern void set_rq_online (struct rq *rq);

2021-05-08 08:09:15

by Aubrey Li

[permalink] [raw]
Subject: Re: [PATCH v2 04/19] sched: Prepare for Core-wide rq->lock

On Fri, May 7, 2021 at 8:34 PM Peter Zijlstra <[email protected]> wrote:
>
>
> When switching on core-sched, CPUs need to agree which lock to use for
> their RQ.
>
> The new rule will be that rq->core_enabled will be toggled while
> holding all rq->__locks that belong to a core. This means we need to
> double check the rq->core_enabled value after each lock acquire and
> retry if it changed.
>
> This also has implications for those sites that take multiple RQ
> locks, they need to be careful that the second lock doesn't end up
> being the first lock.
>
> Verify the lock pointer after acquiring the first lock, because if
> they're on the same core, holding any of the rq->__lock instances will
> pin the core state.
>
> While there, change the rq->__lock order to CPU number, instead of rq
> address, this greatly simplifies the next patch.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Tested-by: Don Hiatt <[email protected]>
> Tested-by: Hongyu Ning <[email protected]>
> ---
> kernel/sched/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 48 +++++++++++++++++-------------------------------
> 2 files changed, 63 insertions(+), 33 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -186,12 +186,37 @@ int sysctl_sched_rt_runtime = 950000;
>
> void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
> {
> - raw_spin_lock_nested(rq_lockp(rq), subclass);
> + raw_spinlock_t *lock;
> +
> + if (sched_core_disabled()) {
> + raw_spin_lock_nested(&rq->__lock, subclass);
> + return;
> + }
> +
> + for (;;) {
> + lock = rq_lockp(rq);
> + raw_spin_lock_nested(lock, subclass);
> + if (likely(lock == rq_lockp(rq)))
> + return;
> + raw_spin_unlock(lock);
> + }
> }
>
> bool raw_spin_rq_trylock(struct rq *rq)
> {
> - return raw_spin_trylock(rq_lockp(rq));
> + raw_spinlock_t *lock;
> + bool ret;
> +
> + if (sched_core_disabled())
> + return raw_spin_trylock(&rq->__lock);
> +
> + for (;;) {
> + lock = rq_lockp(rq);
> + ret = raw_spin_trylock(lock);
> + if (!ret || (likely(lock == rq_lockp(rq))))
> + return ret;
> + raw_spin_unlock(lock);
> + }
> }
>
> void raw_spin_rq_unlock(struct rq *rq)
> @@ -199,6 +224,25 @@ void raw_spin_rq_unlock(struct rq *rq)
> raw_spin_unlock(rq_lockp(rq));
> }
>
> +#ifdef CONFIG_SMP
> +/*
> + * double_rq_lock - safely lock two runqueues
> + */
> +void double_rq_lock(struct rq *rq1, struct rq *rq2)

Do we need the static lock checking here?
__acquires(rq1->lock)
__acquires(rq2->lock)

> +{
> + lockdep_assert_irqs_disabled();
> +
> + if (rq_order_less(rq2, rq1))
> + swap(rq1, rq2);
> +
> + raw_spin_rq_lock(rq1);
> + if (rq_lockp(rq1) == rq_lockp(rq2)) {

And here?
__acquire(rq2->lock);

> + return;
}
> +
> + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> +}
> +#endif
> +
> /*
> * __task_rq_lock - lock the rq @p resides on.
> */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1113,6 +1113,11 @@ static inline bool is_migration_disabled
> #endif
> }
>
> +static inline bool sched_core_disabled(void)
> +{
> + return true;
> +}
> +
> static inline raw_spinlock_t *rq_lockp(struct rq *rq)
> {
> return &rq->__lock;
> @@ -2231,10 +2236,17 @@ unsigned long arch_scale_freq_capacity(i
> }
> #endif
>
> +
> #ifdef CONFIG_SMP
> -#ifdef CONFIG_PREEMPTION
>
> -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
> +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
> +{
> + return rq1->cpu < rq2->cpu;
> +}
> +
> +extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
> +
> +#ifdef CONFIG_PREEMPTION
>
> /*
> * fair double_lock_balance: Safely acquires both rq->locks in a fair
> @@ -2274,14 +2286,13 @@ static inline int _double_lock_balance(s
> if (likely(raw_spin_rq_trylock(busiest)))
> return 0;
>
> - if (rq_lockp(busiest) >= rq_lockp(this_rq)) {
> + if (rq_order_less(this_rq, busiest)) {
> raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
> return 0;
> }
>
> raw_spin_rq_unlock(this_rq);
> - raw_spin_rq_lock(busiest);
> - raw_spin_rq_lock_nested(this_rq, SINGLE_DEPTH_NESTING);
> + double_rq_lock(this_rq, busiest);
>
> return 1;
> }
> @@ -2334,31 +2345,6 @@ static inline void double_raw_lock(raw_s
> }
>
> /*
> - * double_rq_lock - safely lock two runqueues
> - *
> - * Note this does not disable interrupts like task_rq_lock,
> - * you need to do so manually before calling.
> - */
> -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
> - __acquires(rq1->lock)
> - __acquires(rq2->lock)
> -{
> - BUG_ON(!irqs_disabled());
> - if (rq_lockp(rq1) == rq_lockp(rq2)) {
> - raw_spin_rq_lock(rq1);
> - __acquire(rq2->lock); /* Fake it out ;) */
> - } else {
> - if (rq_lockp(rq1) < rq_lockp(rq2)) {
> - raw_spin_rq_lock(rq1);
> - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> - } else {
> - raw_spin_rq_lock(rq2);
> - raw_spin_rq_lock_nested(rq1, SINGLE_DEPTH_NESTING);
> - }
> - }
> -}
> -
> -/*
> * double_rq_unlock - safely unlock two runqueues
> *
> * Note this does not restore interrupts like task_rq_unlock,
> @@ -2368,11 +2354,11 @@ static inline void double_rq_unlock(stru
> __releases(rq1->lock)
> __releases(rq2->lock)
> {
> - raw_spin_rq_unlock(rq1);
> if (rq_lockp(rq1) != rq_lockp(rq2))
> raw_spin_rq_unlock(rq2);
> else
> __release(rq2->lock);
> + raw_spin_rq_unlock(rq1);

This change seems unnecessary, as the soft-lockup root cause is not
the misordered lock release.

Thanks,
-Aubrey

2021-05-12 09:08:56

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 04/19] sched: Prepare for Core-wide rq->lock

On Sat, May 08, 2021 at 04:07:35PM +0800, Aubrey Li wrote:
> > +#ifdef CONFIG_SMP
> > +/*
> > + * double_rq_lock - safely lock two runqueues
> > + */
> > +void double_rq_lock(struct rq *rq1, struct rq *rq2)
>
> Do we need the static lock checking here?
> __acquires(rq1->lock)
> __acquires(rq2->lock)
>
> > +{
> > + lockdep_assert_irqs_disabled();
> > +
> > + if (rq_order_less(rq2, rq1))
> > + swap(rq1, rq2);
> > +
> > + raw_spin_rq_lock(rq1);
> > + if (rq_lockp(rq1) == rq_lockp(rq2)) {
>
> And here?
> __acquire(rq2->lock);
>
> > + return;
> }
> > +
> > + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> > +}
> > +#endif

I'd as soon rip out all that sparse annotation crud; I don't think I've
ever had any benefit from it.
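
For completeness, the annotated variant being asked about would look
roughly like this (sparse-only annotations that compile away otherwise;
illustrative, and per the above it was not applied):

void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	lockdep_assert_irqs_disabled();

	if (rq_order_less(rq2, rq1))
		swap(rq1, rq2);

	raw_spin_rq_lock(rq1);
	if (rq_lockp(rq1) == rq_lockp(rq2)) {
		/* Fake it out for sparse: the shared lock covers rq2 too. */
		__acquire(rq2->lock);
		return;
	}

	raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
}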


> > @@ -2368,11 +2354,11 @@ static inline void double_rq_unlock(stru
> > __releases(rq1->lock)
> > __releases(rq2->lock)
> > {
> > - raw_spin_rq_unlock(rq1);
> > if (rq_lockp(rq1) != rq_lockp(rq2))
> > raw_spin_rq_unlock(rq2);
> > else
> > __release(rq2->lock);
> > + raw_spin_rq_unlock(rq1);
>
> This change seems unnecessary, as the soft-lockup root cause is not
> the misordered lock release.

No, it really is needed; rq_lockp() is not stable if we don't hold a
lock.
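
To spell that out, an illustrative sketch (not from the series) of why the
old ordering is broken: once rq1 is released nothing pins the core state,
so the lock-pointer comparison can race with a core-sched flip.

/* Hypothetical bad ordering, for illustration only. */
static inline void double_rq_unlock_buggy(struct rq *rq1, struct rq *rq2)
{
	raw_spin_rq_unlock(rq1);	/* no lock held: core state unpinned */

	/*
	 * A concurrent core-sched flip can change rq_lockp() right here.
	 * If only the shared core-wide lock was taken, the comparison below
	 * may now see distinct per-rq locks and "release" rq2's __lock even
	 * though it was never acquired.
	 */
	if (rq_lockp(rq1) != rq_lockp(rq2))
		raw_spin_rq_unlock(rq2);
	else
		__release(rq2->lock);
}

With the patch's ordering the comparison is done while the lock(s) taken in
double_rq_lock() are still held, so rq_lockp() cannot change under it.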