LinuxLists.cc - [PATCH] sched/core: Avoid obvious double update_rq

2022-04-18 12:32:07

Subject: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

When we use raw_spin_rq_lock to acquire the rq lock and have to
update the rq clock while holding the lock, the kernel may issue
a WARN_DOUBLE_CLOCK warning.

Since we directly use raw_spin_rq_lock to acquire rq lock instead of
rq_lock, there is no corresponding change to rq->clock_update_flags.
In particular, we have obtained the rq lock of other cores,
the core rq->clock_update_flags may be RQCF_UPDATED at this time, and
then calling update_rq_clock will trigger the WARN_DOUBLE_CLOCK warning.

Some call trace reports:
Call Trace 1:
<IRQ>
sched_rt_period_timer+0x10f/0x3a0
? enqueue_top_rt_rq+0x110/0x110
__hrtimer_run_queues+0x1a9/0x490
hrtimer_interrupt+0x10b/0x240
__sysvec_apic_timer_interrupt+0x8a/0x250
sysvec_apic_timer_interrupt+0x9a/0xd0
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x12/0x20

Call Trace 2:
<TASK>
activate_task+0x8b/0x110
push_rt_task.part.108+0x241/0x2c0
push_rt_tasks+0x15/0x30
finish_task_switch+0xaa/0x2e0
? __switch_to+0x134/0x420
__schedule+0x343/0x8e0
? hrtimer_start_range_ns+0x101/0x340
schedule+0x4e/0xb0
do_nanosleep+0x8e/0x160
hrtimer_nanosleep+0x89/0x120
? hrtimer_init_sleeper+0x90/0x90
__x64_sys_nanosleep+0x96/0xd0
do_syscall_64+0x34/0x90
entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 3:
<TASK>
deactivate_task+0x93/0xe0
pull_rt_task+0x33e/0x400
balance_rt+0x7e/0x90
__schedule+0x62f/0x8e0
do_task_dead+0x3f/0x50
do_exit+0x7b8/0xbb0
do_group_exit+0x2d/0x90
get_signal+0x9df/0x9e0
? preempt_count_add+0x56/0xa0
? __remove_hrtimer+0x35/0x70
arch_do_signal_or_restart+0x36/0x720
? nanosleep_copyout+0x39/0x50
? do_nanosleep+0x131/0x160
? audit_filter_inodes+0xf5/0x120
exit_to_user_mode_prepare+0x10f/0x1e0
syscall_exit_to_user_mode+0x17/0x30
do_syscall_64+0x40/0x90
entry_SYSCALL_64_after_hwframe+0x44/0xae

Steps to reproduce:
1. Enable CONFIG_SCHED_DEBUG when compiling the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched_features
echo "NO_RT_PUSH_IPI" > /sys/kernel/debug/sched_features
3. Run some rt tasks that periodically change the priority and sleep, e.g.:

void *ThreadFun(void *arg)
{
int cnt = *(int*)arg;
struct sched_param param;

while (1) {
sqrt(MAGIC_NUM);
cnt = cnt % 10 + 1;
param.sched_priority = cnt;
pthread_setschedparam(pthread_self(), SCHED_RR, &param);
sqrt(MAGIC_NUM);
sqrt(MAGIC_NUM);
sleep(cnt);
}
return NULL;
}

Signed-off-by: Hao Jia <[email protected]>
---
kernel/sched/deadline.c | 18 +++++++++++-------
kernel/sched/rt.c | 20 ++++++++++++++++++--
2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..9207b978cc43 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2258,6 +2258,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ struct rq_flags srf, drf;
int ret = 0;

if (!rq->dl.overloaded)
@@ -2317,16 +2318,14 @@ static int push_dl_task(struct rq *rq)
goto retry;
}

+ rq_pin_lock(rq, &srf);
+ rq_pin_lock(later_rq, &drf);
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
+ rq_unpin_lock(rq, &srf);
+ rq_unpin_lock(later_rq, &drf);

resched_curr(later_rq);

@@ -2351,6 +2350,7 @@ static void pull_dl_task(struct rq *this_rq)
struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
+ struct rq_flags this_rf, src_rf;
u64 dmin = LONG_MAX;

if (likely(!dl_overloaded(this_rq)))
@@ -2413,11 +2413,15 @@ static void pull_dl_task(struct rq *this_rq)
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
+ rq_pin_lock(this_rq, &this_rf);
+ rq_pin_lock(src_rq, &src_rf);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
resched = true;
+ rq_unpin_lock(this_rq, &this_rf);
+ rq_unpin_lock(src_rq, &src_rf);
}

/* Is there any other task even earlier? */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a32c46889af8..9305ad87fef0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;

/*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;

- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
update_rq_clock(rq);

if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)

if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}

if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -2001,6 +2002,7 @@ static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
+ struct rq_flags srf, drf;
int ret = 0;

if (!rq->rt.overloaded)
@@ -2102,9 +2104,18 @@ static int push_rt_task(struct rq *rq, bool pull)
goto retry;
}

+ /*
+ * We may drop rq'lock in double_lock_balance,
+ * so we still need to clean up the RQCF_UPDATED flag
+ * to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+ rq_pin_lock(rq, &srf);
+ rq_pin_lock(lowest_rq, &drf);
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
+ rq_unpin_lock(rq, &srf);
+ rq_unpin_lock(lowest_rq, &drf);
resched_curr(lowest_rq);
ret = 1;

@@ -2299,6 +2310,7 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p, *push_task;
struct rq *src_rq;
+ struct rq_flags src_rf, this_rf;
int rt_overload_count = rt_overloaded(this_rq);

if (likely(!rt_overload_count))
@@ -2375,10 +2387,14 @@ static void pull_rt_task(struct rq *this_rq)
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
+ rq_pin_lock(this_rq, &this_rf);
+ rq_pin_lock(src_rq, &src_rf);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
resched = true;
+ rq_unpin_lock(this_rq, &this_rf);
+ rq_unpin_lock(src_rq, &src_rf);
}
/*
* We continue with the search, just in
--
2.32.0

2022-04-20 11:08:33

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

On Mon, Apr 18, 2022 at 05:09:29PM +0800, Hao Jia wrote:
> When we use raw_spin_rq_lock to acquire the rq lock and have to
> update the rq clock while holding the lock, the kernel may issue
> a WARN_DOUBLE_CLOCK warning.
>
> Since we directly use raw_spin_rq_lock to acquire rq lock instead of
> rq_lock, there is no corresponding change to rq->clock_update_flags.
> In particular, we have obtained the rq lock of other cores,
> the core rq->clock_update_flags may be RQCF_UPDATED at this time, and
> then calling update_rq_clock will trigger the WARN_DOUBLE_CLOCK warning.

> Signed-off-by: Hao Jia <[email protected]>
> ---
> kernel/sched/deadline.c | 18 +++++++++++-------
> kernel/sched/rt.c | 20 ++++++++++++++++++--

Very good for keeping them in sync.

> 2 files changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index fb4255ae0b2c..9207b978cc43 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c

> @@ -2317,16 +2318,14 @@ static int push_dl_task(struct rq *rq)
> goto retry;
> }
>
> + rq_pin_lock(rq, &srf);
> + rq_pin_lock(later_rq, &drf);
> deactivate_task(rq, next_task, 0);
> set_task_cpu(next_task, later_rq->cpu);
> -
> - /*
> - * Update the later_rq clock here, because the clock is used
> - * by the cpufreq_update_util() inside __add_running_bw().
> - */
> - update_rq_clock(later_rq);
> - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
> + activate_task(later_rq, next_task, 0);
> ret = 1;
> + rq_unpin_lock(rq, &srf);
> + rq_unpin_lock(later_rq, &drf);
>
> resched_curr(later_rq);
>

> @@ -2413,11 +2413,15 @@ static void pull_dl_task(struct rq *this_rq)
> if (is_migration_disabled(p)) {
> push_task = get_push_task(src_rq);
> } else {
> + rq_pin_lock(this_rq, &this_rf);
> + rq_pin_lock(src_rq, &src_rf);
> deactivate_task(src_rq, p, 0);
> set_task_cpu(p, this_cpu);
> activate_task(this_rq, p, 0);
> dmin = p->dl.deadline;
> resched = true;
> + rq_unpin_lock(this_rq, &this_rf);
> + rq_unpin_lock(src_rq, &src_rf);
> }
>
> /* Is there any other task even earlier? */

I'm really not sure about this part though. This is a bit of a mess. The
balancer doesn't really need the pinning stuff. I realize you did that
because we got the clock annotation mixed up with that, but urgh.

Basically we want double_rq_lock() / double_lock_balance() to clear
RQCF_UPDATED, right? Perhaps do that directly?

(maybe with an inline helper and a wee comment?)

The only immediate problem with this would appear to be that
_double_rq_lock() behaves differently when it returns 0. Not sure that
matters.

Hmm?

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f259621f4c93..be4baec84430 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -610,10 +610,13 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);

raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+#ifdef CONFIG_SCHED_DEBUG
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
}
#endif

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8dccb34eb190..3ca8dd5ca17c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2644,6 +2644,10 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+#ifdef CONFIG_SCHED_DEBUG
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
}

/*

2022-04-21 12:11:44

by Hao Jia

[permalink] [raw]

Subject: Re: [External] Re: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

On 4/19/22 6:48 PM, Peter Zijlstra wrote:
> On Mon, Apr 18, 2022 at 05:09:29PM +0800, Hao Jia wrote:
>> When we use raw_spin_rq_lock to acquire the rq lock and have to
>> update the rq clock while holding the lock, the kernel may issue
>> a WARN_DOUBLE_CLOCK warning.
>>
>> Since we directly use raw_spin_rq_lock to acquire rq lock instead of
>> rq_lock, there is no corresponding change to rq->clock_update_flags.
>> In particular, we have obtained the rq lock of other cores,
>> the core rq->clock_update_flags may be RQCF_UPDATED at this time, and
>> then calling update_rq_clock will trigger the WARN_DOUBLE_CLOCK warning.
>
>> Signed-off-by: Hao Jia <[email protected]>
>> ---
>> kernel/sched/deadline.c | 18 +++++++++++-------
>> kernel/sched/rt.c | 20 ++++++++++++++++++--
>
> Very good for keeping them in sync.
>
>> 2 files changed, 29 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
>> index fb4255ae0b2c..9207b978cc43 100644
>> --- a/kernel/sched/deadline.c
>> +++ b/kernel/sched/deadline.c
>
>> @@ -2317,16 +2318,14 @@ static int push_dl_task(struct rq *rq)
>> goto retry;
>> }
>>
>> + rq_pin_lock(rq, &srf);
>> + rq_pin_lock(later_rq, &drf);
>> deactivate_task(rq, next_task, 0);
>> set_task_cpu(next_task, later_rq->cpu);
>> -
>> - /*
>> - * Update the later_rq clock here, because the clock is used
>> - * by the cpufreq_update_util() inside __add_running_bw().
>> - */
>> - update_rq_clock(later_rq);
>> - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
>> + activate_task(later_rq, next_task, 0);
>> ret = 1;
>> + rq_unpin_lock(rq, &srf);
>> + rq_unpin_lock(later_rq, &drf);
>>
>> resched_curr(later_rq);
>>
>
>> @@ -2413,11 +2413,15 @@ static void pull_dl_task(struct rq *this_rq)
>> if (is_migration_disabled(p)) {
>> push_task = get_push_task(src_rq);
>> } else {
>> + rq_pin_lock(this_rq, &this_rf);
>> + rq_pin_lock(src_rq, &src_rf);
>> deactivate_task(src_rq, p, 0);
>> set_task_cpu(p, this_cpu);
>> activate_task(this_rq, p, 0);
>> dmin = p->dl.deadline;
>> resched = true;
>> + rq_unpin_lock(this_rq, &this_rf);
>> + rq_unpin_lock(src_rq, &src_rf);
>> }
>>
>> /* Is there any other task even earlier? */
>
> I'm really not sure about this part though. This is a bit of a mess. The
> balancer doesn't really need the pinning stuff. I realize you did that
> because we got the clock annotation mixed up with that, but urgh.
>
> Basically we want double_rq_lock() / double_lock_balance() to clear
> RQCF_UPDATED, right? Perhaps do that directly?
>
> (maybe with an inline helper and a wee comment?)
>
> The only immediate problem with this would appear to be that
> _double_rq_lock() behaves differently when it returns 0. Not sure that
> matters.
>
> Hmm?

Thanks for your review comments.
As you have prompted, the WARN_DOUBLE_CLOCK warning is still triggered
when _double_rq_lock() returns 0.
Please review the solution below, and based on your review, I will
submit the v2 patch as soon as possible.
Thanks.

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51efaabac3e4..b73fe46cd6c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -610,10 +610,13 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);

raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+#ifdef CONFIG_SCHED_DEBUG
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
}
#endif

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8dccb34eb190..9fe506a6b7b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2544,20 +2544,25 @@ static inline int _double_lock_balance(struct rq
*this_rq, struct rq *busiest)
__acquires(this_rq->lock)
{
if (__rq_lockp(this_rq) == __rq_lockp(busiest))
- return 0;
+ goto out;

if (likely(raw_spin_rq_trylock(busiest)))
- return 0;
+ goto out;

if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
- return 0;
+ goto out;
+ } else {
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+ return 1;
}
-
- raw_spin_rq_unlock(this_rq);
- double_rq_lock(this_rq, busiest);
-
- return 1;
+out:
+#ifdef CONFIG_SCHED_DEBUG
+ this_rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ busiest->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+ return 0;
}

#endif /* CONFIG_PREEMPTION */
@@ -2644,6 +2649,9 @@ static inline void double_rq_lock(struct rq *rq1,
struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+#ifdef CONFIG_SCHED_DEBUG
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
}
>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f259621f4c93..be4baec84430 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -610,10 +610,13 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
> swap(rq1, rq2);
>
> raw_spin_rq_lock(rq1);
> - if (__rq_lockp(rq1) == __rq_lockp(rq2))
> - return;
> + if (__rq_lockp(rq1) != __rq_lockp(rq2))
> + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
>
> - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
> +#ifdef CONFIG_SCHED_DEBUG
> + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
> + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
> +#endif
> }
> #endif
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 8dccb34eb190..3ca8dd5ca17c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2644,6 +2644,10 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
> BUG_ON(rq1 != rq2);
> raw_spin_rq_lock(rq1);
> __acquire(rq2->lock); /* Fake it out ;) */
> +#ifdef CONFIG_SCHED_DEBUG
> + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
> + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
> +#endif
> }
>
> /*

2022-04-22 14:31:16

by Dietmar Eggemann

[permalink] [raw]

Subject: Re: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

On 18/04/2022 11:09, Hao Jia wrote:

[...]

> - /*
> - * Update the later_rq clock here, because the clock is used
> - * by the cpufreq_update_util() inside __add_running_bw().
> - */
> - update_rq_clock(later_rq);
> - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
> + activate_task(later_rq, next_task, 0);

IMHO, this change should go in a separate deadline patch.

The change to call update_rq_clock() before activate_task()
(840d719604b09) is no longer needed since f4904815f97a removed the
add_running_bw() before the activate_task().

[...]

2022-04-22 18:10:21

by Hao Jia

[permalink] [raw]

Subject: Re: [External] Re: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

On 2022/4/21 Dietmar Eggemann wrote:
> On 18/04/2022 11:09, Hao Jia wrote:
>
> [...]
>
>> - /*
>> - * Update the later_rq clock here, because the clock is used
>> - * by the cpufreq_update_util() inside __add_running_bw().
>> - */
>> - update_rq_clock(later_rq);
>> - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
>> + activate_task(later_rq, next_task, 0);
>
> IMHO, this change should go in a separate deadline patch.
>

Thanks again for your review.
I will do it in patch v2.
Thanks.

> The change to call update_rq_clock() before activate_task()
> (840d719604b09) is no longer needed since f4904815f97a removed the
> add_running_bw() before the activate_task().
>
> [...]

2022-04-22 21:55:16

by Dietmar Eggemann

[permalink] [raw]

Subject: Re: [External] Re: [PATCH] sched/core: Avoid obvious double update_rq_clock warning

On 20/04/2022 10:29, Hao Jia wrote:
> On 4/19/22 6:48 PM, Peter Zijlstra wrote:
>> On Mon, Apr 18, 2022 at 05:09:29PM +0800, Hao Jia wrote:

[...]

>> I'm really not sure about this part though. This is a bit of a mess. The
>> balancer doesn't really need the pinning stuff. I realize you did that
>> because we got the clock annotation mixed up with that, but urgh.
>>
>> Basically we want double_rq_lock() / double_lock_balance() to clear
>> RQCF_UPDATED, right? Perhaps do that directly?
>>
>> (maybe with an inline helper and a wee comment?)
>>
>> The only immediate problem with this would appear to be that
>> _double_rq_lock() behaves differently when it returns 0. Not sure that
>> matters.
>>
>> Hmm?
>
> Thanks for your review comments.
> As you have prompted, the WARN_DOUBLE_CLOCK warning is still triggered
> when _double_rq_lock() returns 0.
> Please review the solution below, and based on your review, I will
> submit the v2 patch as soon as possible.
> Thanks.

[...]

Maybe something like this:

-->8--

From: Dietmar Eggemann <[email protected]>
Date: Wed, 20 Apr 2022 11:12:10 +0200
Subject: [PATCH] sched/core: Clear RQCF_UPDATED in _double_lock_balance() &
double_rq_lock()

Signed-off-by: Dietmar Eggemann <[email protected]>
---
kernel/sched/core.c | 6 +++---
kernel/sched/sched.h | 20 ++++++++++++++++----
2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 068c088e9584..f4cfe7eea861 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -610,10 +610,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);

raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ rq_clock_clear_update(rq1, rq2);
}
#endif

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..3a77b10d7cc4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2515,6 +2515,16 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)

extern void double_rq_lock(struct rq *rq1, struct rq *rq2);

+#ifdef CONFIG_SCHED_DEBUG
+static inline void rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+}
+#else
+static inline void rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
+
#ifdef CONFIG_PREEMPTION

/*
@@ -2549,14 +2559,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- if (__rq_lockp(this_rq) == __rq_lockp(busiest))
- return 0;
-
- if (likely(raw_spin_rq_trylock(busiest)))
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ rq_clock_clear_update(this_rq, busiest);
return 0;
+ }

if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ rq_clock_clear_update(this_rq, busiest);
return 0;
}

@@ -2650,6 +2661,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ rq_clock_clear_update(rq1, rq2);
}

/*
--
2.25.1