2023-04-10 08:18:35

by Hao Jia

Subject: [PATCH 0/2] Fix two warnings about rq clock

These two patches fix two warnings about the rq clock.

Patch 1 fixes the warning about using a stale rq clock caused by a
missing rq clock update.

Patch 2 fixes the warning that the rq clock is updated multiple
times while holding the rq lock.

Hao Jia (2):
sched/core: Fixed missing rq clock update before calling
set_rq_offline()
sched/core: Avoid double calling update_rq_clock()

kernel/sched/core.c | 11 +++++++----
kernel/sched/fair.c | 9 ++++++---
kernel/sched/topology.c | 10 ++++++----
3 files changed, 19 insertions(+), 11 deletions(-)

--
2.37.0


2023-04-10 08:20:22

by Hao Jia

Subject: [PATCH 1/2] sched/core: Fixed missing rq clock update before calling set_rq_offline()

The following warning is triggered during cpu offline when CONFIG_CPU_FREQ is enabled:

------------[ cut here ]------------
rq->clock_update_flags < RQCF_ACT_SKIP
WARNING: CPU: 24 PID: 754 at kernel/sched/sched.h:1496
enqueue_top_rt_rq+0x139/0x160
Call Trace:
<TASK>
? intel_pstate_update_util+0x3b0/0x3b0
rq_offline_rt+0x1b7/0x250
set_rq_offline.part.120+0x28/0x60
rq_attach_root+0xc4/0xd0
cpu_attach_domain+0x3dc/0x7f0
? __schedule+0x65e/0x1310
partition_sched_domains_locked+0x2a5/0x3c0
rebuild_sched_domains_locked+0x477/0x830
? percpu_rwsem_wait+0x140/0x140
rebuild_sched_domains+0x1b/0x30
cpuset_hotplug_workfn+0x2ca/0xc90
? balance_push+0x56/0x120
? _raw_spin_unlock+0x15/0x30
? finish_task_switch+0x98/0x2f0
? __switch_to+0x116/0x410
? __schedule+0x65e/0x1310
? internal_add_timer+0x42/0x60
? _raw_spin_unlock_irqrestore+0x23/0x40
? add_timer_on+0xd5/0x130
process_one_work+0x1bc/0x3d0
worker_thread+0x4c/0x380
? preempt_count_add+0x56/0xa0
? rescuer_thread+0x310/0x310
kthread+0xe6/0x110
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x1f/0x30

Before calling set_rq_offline() we need to update the rq clock to avoid
using a stale rq clock, and replace raw_spin_rq_lock_irqsave()/
raw_spin_rq_unlock_irqrestore() with rq_lock_irqsave()/rq_unlock_irqrestore()
so that rq->clock_update_flags is cleared before the rq clock is updated.
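
For context, the warning above comes from assert_clock_updated() in
kernel/sched/sched.h, typically reached via rq_clock(); abridged here for
reference, details may differ between kernel versions:

/* kernel/sched/sched.h (abridged) */
#define RQCF_REQ_SKIP	0x01	/* request to skip the next clock update */
#define RQCF_ACT_SKIP	0x02	/* skip is in effect, updates are ignored */
#define RQCF_UPDATED	0x04	/* debug: clock updated since the rq lock was pinned */

static inline void assert_clock_updated(struct rq *rq)
{
	/*
	 * The only reason for not seeing a clock update since the
	 * last rq_pin_lock() is if we're currently skipping updates.
	 */
	SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}

rq_lock_irqsave() pins the rq lock via rq_pin_lock(), which clears
RQCF_UPDATED, so the update_rq_clock() added here does not then trip the
WARN_DOUBLE_CLOCK check; raw_spin_rq_lock_irqsave() does not do that,
hence the locking change.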

Steps to reproduce:
1. Enable CONFIG_SMP and CONFIG_CPU_FREQ when compiling the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
3. cpupower -c all frequency-set -g powersave
4. Run some rt tasks, e.g. create 5*n rt (100% running) tasks on a
system with n CPUs (a minimal helper is sketched below)
5. Offline CPUs one by one until the warning is triggered
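
For step 4, a minimal userspace helper (illustration only, not part of
this patch; any generator of always-running rt tasks will do) could look
like:

/* rt-hog.c: spawn N SCHED_FIFO busy-loop tasks.
 * Assumed usage: gcc -O2 rt-hog.c -o rt-hog && sudo ./rt-hog <N>
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long n = argc > 1 ? atol(argv[1]) : 5 * sysconf(_SC_NPROCESSORS_ONLN);
	struct sched_param sp = { .sched_priority = 1 };

	for (long i = 0; i < n; i++) {
		if (fork() == 0) {
			/* Turn this child into a 100%-running rt task. */
			if (sched_setscheduler(0, SCHED_FIFO, &sp))
				perror("sched_setscheduler");
			for (;;)
				;
		}
	}
	pause();	/* parent idles; kill the process group to stop */
	return 0;
}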

Signed-off-by: Hao Jia <[email protected]>
---
kernel/sched/topology.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 051aaf65c749..017f7583f0e2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -487,15 +487,17 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;

- raw_spin_rq_lock_irqsave(rq, flags);
+ rq_lock_irqsave(rq, &rf);

if (rq->rd) {
old_rd = rq->rd;

- if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ if (cpumask_test_cpu(rq->cpu, old_rd->online)) {
+ update_rq_clock(rq);
set_rq_offline(rq);
+ }

cpumask_clear_cpu(rq->cpu, old_rd->span);

@@ -515,7 +517,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);

- raw_spin_rq_unlock_irqrestore(rq, flags);
+ rq_unlock_irqrestore(rq, &rf);

if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
--
2.37.0

2023-04-10 08:20:23

by Hao Jia

Subject: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()

Several double rq clock update warnings are triggered:
------------[ cut here ]------------
rq->clock_update_flags & RQCF_UPDATED
WARNING: CPU: 17 PID: 138 at kernel/sched/core.c:741
update_rq_clock+0xaf/0x180
Call Trace:
<TASK>
__balance_push_cpu_stop+0x146/0x180
? migration_cpu_stop+0x2a0/0x2a0
cpu_stopper_thread+0xa3/0x140
smpboot_thread_fn+0x14f/0x210
? sort_range+0x20/0x20
kthread+0xe6/0x110
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x1f/0x30

------------[ cut here ]------------
rq->clock_update_flags & RQCF_UPDATED
WARNING: CPU: 54 PID: 0 at kernel/sched/core.c:741
update_rq_clock+0xaf/0x180
Call Trace:
<TASK>
unthrottle_cfs_rq+0x4b/0x300
__cfsb_csd_unthrottle+0xe0/0x100
__flush_smp_call_function_queue+0xaf/0x1d0
flush_smp_call_function_queue+0x49/0x90
do_idle+0x17c/0x270
cpu_startup_entry+0x19/0x20
start_secondary+0xfa/0x120
secondary_startup_64_no_verify+0xce/0xdb

------------[ cut here ]------------
rq->clock_update_flags & RQCF_UPDATED
WARNING: CPU: 0 PID: 3323 at kernel/sched/core.c:741
update_rq_clock+0xaf/0x180
Call Trace:
<TASK>
unthrottle_cfs_rq+0x4b/0x300
rq_offline_fair+0x89/0x90
set_rq_offline.part.118+0x28/0x60
rq_attach_root+0xc4/0xd0
cpu_attach_domain+0x3dc/0x7f0
partition_sched_domains_locked+0x2a5/0x3c0
rebuild_sched_domains_locked+0x477/0x830
rebuild_sched_domains+0x1b/0x30
cpuset_hotplug_workfn+0x2ca/0xc90
? balance_push+0x56/0xf0
? _raw_spin_unlock+0x15/0x30
? finish_task_switch+0x98/0x2f0
? __switch_to+0x291/0x410
? __schedule+0x65e/0x1310
process_one_work+0x1bc/0x3d0
worker_thread+0x4c/0x380
? preempt_count_add+0x92/0xa0
? rescuer_thread+0x310/0x310
kthread+0xe6/0x110
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x1f/0x30

For the __balance_push_cpu_stop() case, remove update_rq_clock() from
__migrate_task() to avoid updating the rq clock twice, and, to avoid
missing an rq clock update, add an update_rq_clock() call before
migration_cpu_stop() calls __migrate_task().

The same applies to unthrottle_cfs_rq(): remove update_rq_clock() from
unthrottle_cfs_rq() to avoid the warnings caused by callers that invoke
it multiple times, such as __cfsb_csd_unthrottle() and
unthrottle_offline_cfs_rqs(), and correspondingly add update_rq_clock()
calls before unthrottle_cfs_rq() runs so that no clock update is missed.

Note that the rq clock has already been updated before set_rq_offline()
runs, so there is no need to add an update_rq_clock() call in
unthrottle_offline_cfs_rqs().
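
For context, these warnings come from the WARN_DOUBLE_CLOCK check in
update_rq_clock(); abridged from kernel/sched/core.c for reference (line
numbers and details may differ between kernel versions):

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_rq_held(rq);

	/* An active skip request turns this call into a no-op. */
	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	/* Warn if the clock was already updated since the rq lock was pinned. */
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}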

Signed-off-by: Hao Jia <[email protected]>
---
kernel/sched/core.c | 11 +++++++----
kernel/sched/fair.c | 9 ++++++---
2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..c1abd88ec1ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2376,7 +2376,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;

- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);

return rq;
@@ -2434,10 +2433,12 @@ static int migration_cpu_stop(void *data)
goto out;
}

- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }

/*
* XXX __migrate_task() can fail, at which point we might end
@@ -10759,8 +10760,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;

- if (cfs_rq->throttled)
+ if (cfs_rq->throttled) {
+ update_rq_clock(rq);
unthrottle_cfs_rq(cfs_rq);
+ }
rq_unlock_irq(rq, &rf);
}
if (runtime_was_enabled && !runtime_enabled)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..03ced34e992e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5438,8 +5438,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

cfs_rq->throttled = 0;

- update_rq_clock(rq);
-
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
@@ -5518,6 +5516,7 @@ static void __cfsb_csd_unthrottle(void *arg)
struct rq_flags rf;

rq_lock(rq, &rf);
+ update_rq_clock(rq);

/*
* Since we hold rq lock we're safe from concurrent manipulation of
@@ -5547,6 +5546,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
bool first;

if (rq == this_rq()) {
+ update_rq_clock(rq);
unthrottle_cfs_rq(cfs_rq);
return;
}
@@ -5563,6 +5563,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
#else
static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
+ update_rq_clock(rq_of(cfs_rq));
unthrottle_cfs_rq(cfs_rq);
}
#endif
@@ -5640,8 +5641,10 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
if (local_unthrottle) {
rq = cpu_rq(this_cpu);
rq_lock_irqsave(rq, &rf);
- if (cfs_rq_throttled(local_unthrottle))
+ if (cfs_rq_throttled(local_unthrottle)) {
+ update_rq_clock(rq);
unthrottle_cfs_rq(local_unthrottle);
+ }
rq_unlock_irqrestore(rq, &rf);
}

--
2.37.0

2023-04-20 11:23:02

by Hao Jia

Subject: Re: [PATCH 0/2] Fix two warnings about rq clock

kindly ping...


On 2023/4/10 Hao Jia wrote:
> These two patches fix two warnings about rq clock
>
> Patch1 fixes the warning of using the old rq clock caused by
> missing update rq clock.
>
> Patch2 fixes the warning that the rq clock was updated multiple
> times while holding the rq lock.
>
> Hao Jia (2):
> sched/core: Fixed missing rq clock update before calling
> set_rq_offline()
> sched/core: Avoid double calling update_rq_clock()
>
> kernel/sched/core.c | 11 +++++++----
> kernel/sched/fair.c | 9 ++++++---
> kernel/sched/topology.c | 10 ++++++----
> 3 files changed, 19 insertions(+), 11 deletions(-)
>

2023-04-21 13:59:36

by Vincent Guittot

Subject: Re: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()

On Mon, 10 Apr 2023 at 10:12, Hao Jia <[email protected]> wrote:
>
> There are some double rq clock update warnings are triggered.
> ------------[ cut here ]------------
> rq->clock_update_flags & RQCF_UPDATED
> WARNING: CPU: 17 PID: 138 at kernel/sched/core.c:741
> update_rq_clock+0xaf/0x180
> Call Trace:
> <TASK>
> __balance_push_cpu_stop+0x146/0x180
> ? migration_cpu_stop+0x2a0/0x2a0
> cpu_stopper_thread+0xa3/0x140
> smpboot_thread_fn+0x14f/0x210
> ? sort_range+0x20/0x20
> kthread+0xe6/0x110
> ? kthread_complete_and_exit+0x20/0x20
> ret_from_fork+0x1f/0x30
>
> ------------[ cut here ]------------
> rq->clock_update_flags & RQCF_UPDATED
> WARNING: CPU: 54 PID: 0 at kernel/sched/core.c:741
> update_rq_clock+0xaf/0x180
> Call Trace:
> <TASK>
> unthrottle_cfs_rq+0x4b/0x300
> __cfsb_csd_unthrottle+0xe0/0x100
> __flush_smp_call_function_queue+0xaf/0x1d0
> flush_smp_call_function_queue+0x49/0x90
> do_idle+0x17c/0x270
> cpu_startup_entry+0x19/0x20
> start_secondary+0xfa/0x120
> secondary_startup_64_no_verify+0xce/0xdb
>
> ------------[ cut here ]------------
> rq->clock_update_flags & RQCF_UPDATED
> WARNING: CPU: 0 PID: 3323 at kernel/sched/core.c:741
> update_rq_clock+0xaf/0x180
> Call Trace:
> <TASK>
> unthrottle_cfs_rq+0x4b/0x300
> rq_offline_fair+0x89/0x90
> set_rq_offline.part.118+0x28/0x60

So this is generated by patch 1, isn't it?

> rq_attach_root+0xc4/0xd0
> cpu_attach_domain+0x3dc/0x7f0
> partition_sched_domains_locked+0x2a5/0x3c0
> rebuild_sched_domains_locked+0x477/0x830
> rebuild_sched_domains+0x1b/0x30
> cpuset_hotplug_workfn+0x2ca/0xc90
> ? balance_push+0x56/0xf0
> ? _raw_spin_unlock+0x15/0x30
> ? finish_task_switch+0x98/0x2f0
> ? __switch_to+0x291/0x410
> ? __schedule+0x65e/0x1310
> process_one_work+0x1bc/0x3d0
> worker_thread+0x4c/0x380
> ? preempt_count_add+0x92/0xa0
> ? rescuer_thread+0x310/0x310
> kthread+0xe6/0x110
> ? kthread_complete_and_exit+0x20/0x20
> ret_from_fork+0x1f/0x30
>
> For the __balance_push_cpu_stop() case, we remove update_rq_clock() from
> the __migrate_task() function to avoid double updating the rq clock.
> And in order to avoid missing rq clock update, add update_rq_clock()
> call before migration_cpu_stop() calls __migrate_task().
>
> This also works for unthrottle_cfs_rq(), so we also removed
> update_rq_clock() from the unthrottle_cfs_rq() function to avoid
> warnings caused by calling it multiple times, such as
> __cfsb_csd_unthrottle() and unthrottle_offline_cfs_rqs(). and
> in order to avoid missing rq clock update, we correspondingly add
> update_rq_clock() calls before unthrottle_cfs_rq() runs.
>
> Note that the rq clock has been updated before the set_rq_offline()
> function runs, so we don't need to add update_rq_clock() call in
> unthrottle_offline_cfs_rqs().
>
> Signed-off-by: Hao Jia <[email protected]>
> ---
> kernel/sched/core.c | 11 +++++++----
> kernel/sched/fair.c | 9 ++++++---
> 2 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0d18c3969f90..c1abd88ec1ed 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2376,7 +2376,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
> if (!is_cpu_allowed(p, dest_cpu))
> return rq;
>
> - update_rq_clock(rq);
> rq = move_queued_task(rq, rf, p, dest_cpu);
>
> return rq;
> @@ -2434,10 +2433,12 @@ static int migration_cpu_stop(void *data)
> goto out;
> }
>
> - if (task_on_rq_queued(p))
> + if (task_on_rq_queued(p)) {
> + update_rq_clock(rq);
> rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
> - else
> + } else {
> p->wake_cpu = arg->dest_cpu;
> + }
>
> /*
> * XXX __migrate_task() can fail, at which point we might end
> @@ -10759,8 +10760,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
> cfs_rq->runtime_enabled = runtime_enabled;
> cfs_rq->runtime_remaining = 0;
>
> - if (cfs_rq->throttled)
> + if (cfs_rq->throttled) {
> + update_rq_clock(rq);
> unthrottle_cfs_rq(cfs_rq);
> + }
> rq_unlock_irq(rq, &rf);
> }
> if (runtime_was_enabled && !runtime_enabled)
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6986ea31c984..03ced34e992e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5438,8 +5438,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>
> cfs_rq->throttled = 0;
>
> - update_rq_clock(rq);
> -
> raw_spin_lock(&cfs_b->lock);
> cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
> list_del_rcu(&cfs_rq->throttled_list);
> @@ -5518,6 +5516,7 @@ static void __cfsb_csd_unthrottle(void *arg)
> struct rq_flags rf;
>
> rq_lock(rq, &rf);
> + update_rq_clock(rq);
>
> /*
> * Since we hold rq lock we're safe from concurrent manipulation of
> @@ -5547,6 +5546,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
> bool first;
>
> if (rq == this_rq()) {
> + update_rq_clock(rq);
> unthrottle_cfs_rq(cfs_rq);
> return;
> }
> @@ -5563,6 +5563,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
> #else
> static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
> {
> + update_rq_clock(rq_of(cfs_rq));
> unthrottle_cfs_rq(cfs_rq);
> }
> #endif
> @@ -5640,8 +5641,10 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
> if (local_unthrottle) {
> rq = cpu_rq(this_cpu);
> rq_lock_irqsave(rq, &rf);
> - if (cfs_rq_throttled(local_unthrottle))
> + if (cfs_rq_throttled(local_unthrottle)) {
> + update_rq_clock(rq);
> unthrottle_cfs_rq(local_unthrottle);
> + }
> rq_unlock_irqrestore(rq, &rf);
> }
>
> --
> 2.37.0
>

2023-05-02 10:28:21

by Hao Jia

Subject: Re: [External] Re: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()



On 2023/4/21 Vincent Guittot wrote:
> On Mon, 10 Apr 2023 at 10:12, Hao Jia <[email protected]> wrote:
>>
>> There are some double rq clock update warnings are triggered.
>> ------------[ cut here ]------------
>> rq->clock_update_flags & RQCF_UPDATED
>> WARNING: CPU: 17 PID: 138 at kernel/sched/core.c:741
>> update_rq_clock+0xaf/0x180
>> Call Trace:
>> <TASK>
>> __balance_push_cpu_stop+0x146/0x180
>> ? migration_cpu_stop+0x2a0/0x2a0
>> cpu_stopper_thread+0xa3/0x140
>> smpboot_thread_fn+0x14f/0x210
>> ? sort_range+0x20/0x20
>> kthread+0xe6/0x110
>> ? kthread_complete_and_exit+0x20/0x20
>> ret_from_fork+0x1f/0x30
>>
>> ------------[ cut here ]------------
>> rq->clock_update_flags & RQCF_UPDATED
>> WARNING: CPU: 54 PID: 0 at kernel/sched/core.c:741
>> update_rq_clock+0xaf/0x180
>> Call Trace:
>> <TASK>
>> unthrottle_cfs_rq+0x4b/0x300
>> __cfsb_csd_unthrottle+0xe0/0x100
>> __flush_smp_call_function_queue+0xaf/0x1d0
>> flush_smp_call_function_queue+0x49/0x90
>> do_idle+0x17c/0x270
>> cpu_startup_entry+0x19/0x20
>> start_secondary+0xfa/0x120
>> secondary_startup_64_no_verify+0xce/0xdb
>>
>> ------------[ cut here ]------------
>> rq->clock_update_flags & RQCF_UPDATED
>> WARNING: CPU: 0 PID: 3323 at kernel/sched/core.c:741
>> update_rq_clock+0xaf/0x180
>> Call Trace:
>> <TASK>
>> unthrottle_cfs_rq+0x4b/0x300
>> rq_offline_fair+0x89/0x90
>> set_rq_offline.part.118+0x28/0x60
>
> So this is generated by patch 1, isn't it ?

Sorry for the late reply, I just got back after being away for a while.

IIRC, this is not generated by patch 1.

In unthrottle_offline_cfs_rqs() we traverse task_groups with
list_for_each_entry_rcu(), so unthrottle_cfs_rq() may be called multiple
times, resulting in multiple rq clock updates under the same rq lock (see
the sketch below).
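
The loop in question, abridged from kernel/sched/fair.c (details vary
between kernel versions), is roughly:

static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct task_group *tg;

	lockdep_assert_rq_held(rq);

	rcu_read_lock();
	list_for_each_entry_rcu(tg, &task_groups, list) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

		if (!cfs_rq->runtime_enabled)
			continue;

		/* ... */

		/*
		 * Before this series unthrottle_cfs_rq() called
		 * update_rq_clock() itself, so the clock could be updated
		 * once per throttled group under the same rq lock.
		 */
		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
	rcu_read_unlock();
}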

Thanks,
Hao
>
>> rq_attach_root+0xc4/0xd0
>> cpu_attach_domain+0x3dc/0x7f0
>> partition_sched_domains_locked+0x2a5/0x3c0
>> rebuild_sched_domains_locked+0x477/0x830
>> rebuild_sched_domains+0x1b/0x30
>> cpuset_hotplug_workfn+0x2ca/0xc90
>> ? balance_push+0x56/0xf0
>> ? _raw_spin_unlock+0x15/0x30
>> ? finish_task_switch+0x98/0x2f0
>> ? __switch_to+0x291/0x410
>> ? __schedule+0x65e/0x1310
>> process_one_work+0x1bc/0x3d0
>> worker_thread+0x4c/0x380
>> ? preempt_count_add+0x92/0xa0
>> ? rescuer_thread+0x310/0x310
>> kthread+0xe6/0x110
>> ? kthread_complete_and_exit+0x20/0x20
>> ret_from_fork+0x1f/0x30
>>
>> For the __balance_push_cpu_stop() case, we remove update_rq_clock() from
>> the __migrate_task() function to avoid double updating the rq clock.
>> And in order to avoid missing rq clock update, add update_rq_clock()
>> call before migration_cpu_stop() calls __migrate_task().
>>
>> This also works for unthrottle_cfs_rq(), so we also removed
>> update_rq_clock() from the unthrottle_cfs_rq() function to avoid
>> warnings caused by calling it multiple times, such as
>> __cfsb_csd_unthrottle() and unthrottle_offline_cfs_rqs(). and
>> in order to avoid missing rq clock update, we correspondingly add
>> update_rq_clock() calls before unthrottle_cfs_rq() runs.
>>
>> Note that the rq clock has been updated before the set_rq_offline()
>> function runs, so we don't need to add update_rq_clock() call in
>> unthrottle_offline_cfs_rqs().
>>



2023-05-04 16:42:01

by Vincent Guittot

Subject: Re: [External] Re: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()

On Tuesday, 2 May 2023 at 18:22:56 (+0800), Hao Jia wrote:
>
>
> On 2023/4/21 Vincent Guittot wrote:
> > On Mon, 10 Apr 2023 at 10:12, Hao Jia <[email protected]> wrote:
> > >
> > > There are some double rq clock update warnings are triggered.
> > > ------------[ cut here ]------------
> > > rq->clock_update_flags & RQCF_UPDATED
> > > WARNING: CPU: 17 PID: 138 at kernel/sched/core.c:741
> > > update_rq_clock+0xaf/0x180
> > > Call Trace:
> > > <TASK>
> > > __balance_push_cpu_stop+0x146/0x180
> > > ? migration_cpu_stop+0x2a0/0x2a0
> > > cpu_stopper_thread+0xa3/0x140
> > > smpboot_thread_fn+0x14f/0x210
> > > ? sort_range+0x20/0x20
> > > kthread+0xe6/0x110
> > > ? kthread_complete_and_exit+0x20/0x20
> > > ret_from_fork+0x1f/0x30
> > >
> > > ------------[ cut here ]------------
> > > rq->clock_update_flags & RQCF_UPDATED
> > > WARNING: CPU: 54 PID: 0 at kernel/sched/core.c:741
> > > update_rq_clock+0xaf/0x180
> > > Call Trace:
> > > <TASK>
> > > unthrottle_cfs_rq+0x4b/0x300
> > > __cfsb_csd_unthrottle+0xe0/0x100
> > > __flush_smp_call_function_queue+0xaf/0x1d0
> > > flush_smp_call_function_queue+0x49/0x90
> > > do_idle+0x17c/0x270
> > > cpu_startup_entry+0x19/0x20
> > > start_secondary+0xfa/0x120
> > > secondary_startup_64_no_verify+0xce/0xdb
> > >
> > > ------------[ cut here ]------------
> > > rq->clock_update_flags & RQCF_UPDATED
> > > WARNING: CPU: 0 PID: 3323 at kernel/sched/core.c:741
> > > update_rq_clock+0xaf/0x180
> > > Call Trace:
> > > <TASK>
> > > unthrottle_cfs_rq+0x4b/0x300
> > > rq_offline_fair+0x89/0x90
> > > set_rq_offline.part.118+0x28/0x60
> >
> > So this is generated by patch 1, isn't it ?
>
> Sorry for the late reply, I just got back from a long term.
>
> IIRC, this is not generated by patch1.

yeah, that's in the cfs loop

>
> In the unthrottle_offline_cfs_rqs() function, we traverse task_groups
> through list_for_each_entry_rcu, so unthrottle_cfs_rq() may be called
> multiple times, resulting in multiple updates to the rq clock.
>
> Thanks,
> Hao
> >
> > > rq_attach_root+0xc4/0xd0
> > > cpu_attach_domain+0x3dc/0x7f0
> > > partition_sched_domains_locked+0x2a5/0x3c0
> > > rebuild_sched_domains_locked+0x477/0x830
> > > rebuild_sched_domains+0x1b/0x30
> > > cpuset_hotplug_workfn+0x2ca/0xc90
> > > ? balance_push+0x56/0xf0
> > > ? _raw_spin_unlock+0x15/0x30
> > > ? finish_task_switch+0x98/0x2f0
> > > ? __switch_to+0x291/0x410
> > > ? __schedule+0x65e/0x1310
> > > process_one_work+0x1bc/0x3d0
> > > worker_thread+0x4c/0x380
> > > ? preempt_count_add+0x92/0xa0
> > > ? rescuer_thread+0x310/0x310
> > > kthread+0xe6/0x110
> > > ? kthread_complete_and_exit+0x20/0x20
> > > ret_from_fork+0x1f/0x30
> > >
> > > For the __balance_push_cpu_stop() case, we remove update_rq_clock() from
> > > the __migrate_task() function to avoid double updating the rq clock.
> > > And in order to avoid missing rq clock update, add update_rq_clock()
> > > call before migration_cpu_stop() calls __migrate_task().

Can we do the opposite?
AFAICT, update_rq_clock() in __balance_push_cpu_stop() is only there for
__migrate_task(). I prefer to keep the update_rq_clock() as close as possible
to the user

> > >
> > > This also works for unthrottle_cfs_rq(), so we also removed
> > > update_rq_clock() from the unthrottle_cfs_rq() function to avoid
> > > warnings caused by calling it multiple times, such as
> > > __cfsb_csd_unthrottle() and unthrottle_offline_cfs_rqs(). and

This happens with the for loop added by
commit 8ad075c2eb1f ("sched: Async unthrottling for cfs bandwidth").

> > > in order to avoid missing rq clock update, we correspondingly add
> > > update_rq_clock() calls before unthrottle_cfs_rq() runs.

These are special cases that happen because of the for_each.
As said above, I would prefer keeping update_rq_clock() close to its users.

Could we use something similar to rq_clock_skip_update() for those lists?

Something like the below:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 898fa3bc2765..7495b6fb229d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9386,8 +9386,6 @@ static int __balance_push_cpu_stop(void *arg)
raw_spin_lock_irq(&p->pi_lock);
rq_lock(rq, &rf);

- update_rq_clock(rq);
-
if (task_rq(p) == rq && task_on_rq_queued(p)) {
cpu = select_fallback_rq(rq->cpu, p);
rq = __migrate_task(rq, &rf, p, cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 503b7617d7e1..a1d47d907360 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5518,6 +5518,12 @@ static void __cfsb_csd_unthrottle(void *arg)
struct rq_flags rf;

rq_lock(rq, &rf);
+ /*
+ * Iterating over the list can trigger several calls to update_rq_clock().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_loop_update(rq);

/*
* Since we hold rq lock we're safe from concurrent manipulation of
@@ -6058,6 +6064,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)

lockdep_assert_rq_held(rq);

+ /*
+ * Iterating over the list can trigger several call to update_rq_clock.
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_loop_update(rq);
+
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 66bacd50d381..681924367891 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1536,6 +1536,19 @@ static inline void rq_clock_skip_update(struct rq *rq)
rq->clock_update_flags |= RQCF_REQ_SKIP;
}

+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_loop_update() can be called after updating the clock once
+ * and before iterating over the list to prevent multiple updates.
+ */
+static inline void rq_clock_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
/*
* See rt task throttling, which is the only time a skip
* request is canceled.


> > >
> > > Note that the rq clock has been updated before the set_rq_offline()
> > > function runs, so we don't need to add update_rq_clock() call in
> > > unthrottle_offline_cfs_rqs().
> > >
>
>
>

2023-05-05 12:33:01

by Hao Jia

Subject: Re: [External] Re: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()



On 2023/5/5 Vincent Guittot wrote:

>>>> rq_attach_root+0xc4/0xd0
>>>> cpu_attach_domain+0x3dc/0x7f0
>>>> partition_sched_domains_locked+0x2a5/0x3c0
>>>> rebuild_sched_domains_locked+0x477/0x830
>>>> rebuild_sched_domains+0x1b/0x30
>>>> cpuset_hotplug_workfn+0x2ca/0xc90
>>>> ? balance_push+0x56/0xf0
>>>> ? _raw_spin_unlock+0x15/0x30
>>>> ? finish_task_switch+0x98/0x2f0
>>>> ? __switch_to+0x291/0x410
>>>> ? __schedule+0x65e/0x1310
>>>> process_one_work+0x1bc/0x3d0
>>>> worker_thread+0x4c/0x380
>>>> ? preempt_count_add+0x92/0xa0
>>>> ? rescuer_thread+0x310/0x310
>>>> kthread+0xe6/0x110
>>>> ? kthread_complete_and_exit+0x20/0x20
>>>> ret_from_fork+0x1f/0x30
>>>>
>>>> For the __balance_push_cpu_stop() case, we remove update_rq_clock() from
>>>> the __migrate_task() function to avoid double updating the rq clock.
>>>> And in order to avoid missing rq clock update, add update_rq_clock()
>>>> call before migration_cpu_stop() calls __migrate_task().
>
> Can we do the opposite ?
> AFAICT, update_rq_clock() in __balance_push_cpu_stop() is only there for
> __migrate_task(). I prefer to keep the update_rq_clock() as close as possible
> to the user

I'm afraid not; the rq clock also needs to be updated before
select_fallback_rq() is called (see the abridged call site below).
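
For reference, the call site (abridged; matching the hunk quoted above)
looks roughly like:

static int __balance_push_cpu_stop(void *arg)
{
	struct task_struct *p = arg;
	struct rq *rq = this_rq();
	struct rq_flags rf;
	int cpu;

	raw_spin_lock_irq(&p->pi_lock);
	rq_lock(rq, &rf);

	/* The existing update precedes both calls below. */
	update_rq_clock(rq);

	if (task_rq(p) == rq && task_on_rq_queued(p)) {
		cpu = select_fallback_rq(rq->cpu, p);
		rq = __migrate_task(rq, &rf, p, cpu);
	}

	rq_unlock(rq, &rf);
	raw_spin_unlock_irq(&p->pi_lock);
	/* ... */
}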


>
>>>>
>>>> This also works for unthrottle_cfs_rq(), so we also removed
>>>> update_rq_clock() from the unthrottle_cfs_rq() function to avoid
>>>> warnings caused by calling it multiple times, such as
>>>> __cfsb_csd_unthrottle() and unthrottle_offline_cfs_rqs(). and
>
> This happens with the for loop added by
> commit: 8ad075c2eb1f ("sched: Async unthrottling for cfs bandwidth")
>

Yes, the warning is caused by this commit.

>>>> in order to avoid missing rq clock update, we correspondingly add
>>>> update_rq_clock() calls before unthrottle_cfs_rq() runs.
>
> These are special cases that happen because of the for_each.
> As said above, I would prefer keeping update_rq_clock close the their user
>
> could we use something similar to rq_clock_skip_update() for those list ?
>

I tried to do it with the method you suggested. Maybe something like this?

We also need to clear RQCF_ACT_SKIP after the loop that follows
rq_clock_loop_update(), to avoid other warnings.


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index efdab1489113..f48b5d912d8c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2420,7 +2420,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;

- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);

return rq;
@@ -2478,10 +2477,12 @@ static int migration_cpu_stop(void *data)
goto out;
}

- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }

/*
 * XXX __migrate_task() can fail, at which point we might end
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..1dcef273bebe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5575,6 +5575,13 @@ static void __cfsb_csd_unthrottle(void *arg)
struct rq_flags rf;

rq_lock(rq, &rf);
+ /*
+ * Iterating over the list can trigger several calls to update_rq_clock()
+ * in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_loop_update(rq);

/*
* Since we hold rq lock we're safe from concurrent manipulation of
@@ -5595,6 +5602,7 @@ static void __cfsb_csd_unthrottle(void *arg)

rcu_read_unlock();

+ rq_clock_cancel_loop_update(rq);
rq_unlock(rq, &rf);
}

@@ -6114,6 +6122,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
struct task_group *tg;

lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated before
+ * set_rq_offline() runs, so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_loop_update(rq);

rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6137,6 +6151,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+ rq_clock_cancel_loop_update(rq);
}

#else /* CONFIG_CFS_BANDWIDTH */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7b3e0a2b20..3d4981d354a9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1536,6 +1536,25 @@ static inline void rq_clock_skip_update(struct rq *rq)
rq->clock_update_flags |= RQCF_REQ_SKIP;
}

+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_loop_update() can be called after updating the clock once
+ * and before iterating over the list to prevent multiple updates.
+ */
+static inline void rq_clock_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_cancel_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
/*
* See rt task throttling, which is the only time a skip

2023-05-09 16:37:37

by Vincent Guittot

Subject: Re: [External] Re: [PATCH 2/2] sched/core: Avoid double calling update_rq_clock()

On Fri, 5 May 2023 at 14:13, Hao Jia <[email protected]> wrote:
>
>
>
> On 2023/5/5 Vincent Guittot wrote:
>
> >>>> rq_attach_root+0xc4/0xd0
> >>>> cpu_attach_domain+0x3dc/0x7f0
> >>>> partition_sched_domains_locked+0x2a5/0x3c0
> >>>> rebuild_sched_domains_locked+0x477/0x830
> >>>> rebuild_sched_domains+0x1b/0x30
> >>>> cpuset_hotplug_workfn+0x2ca/0xc90
> >>>> ? balance_push+0x56/0xf0
> >>>> ? _raw_spin_unlock+0x15/0x30
> >>>> ? finish_task_switch+0x98/0x2f0
> >>>> ? __switch_to+0x291/0x410
> >>>> ? __schedule+0x65e/0x1310
> >>>> process_one_work+0x1bc/0x3d0
> >>>> worker_thread+0x4c/0x380
> >>>> ? preempt_count_add+0x92/0xa0
> >>>> ? rescuer_thread+0x310/0x310
> >>>> kthread+0xe6/0x110
> >>>> ? kthread_complete_and_exit+0x20/0x20
> >>>> ret_from_fork+0x1f/0x30
> >>>>
> >>>> For the __balance_push_cpu_stop() case, we remove update_rq_clock() from
> >>>> the __migrate_task() function to avoid double updating the rq clock.
> >>>> And in order to avoid missing rq clock update, add update_rq_clock()
> >>>> call before migration_cpu_stop() calls __migrate_task().
> >
> > Can we do the opposite ?
> > AFAICT, update_rq_clock() in __balance_push_cpu_stop() is only there for
> > __migrate_task(). I prefer to keep the update_rq_clock() as close as possible
> > to the user
>
> I'm afraid not, the rq clock also needs to be updated before
> select_fallback_rq() is called.

Yes, you're right.

>
>
> >
> >>>>
> >>>> This also works for unthrottle_cfs_rq(), so we also removed
> >>>> update_rq_clock() from the unthrottle_cfs_rq() function to avoid
> >>>> warnings caused by calling it multiple times, such as
> >>>> __cfsb_csd_unthrottle() and unthrottle_offline_cfs_rqs(). and
> >
> > This happens with the for loop added by
> > commit: 8ad075c2eb1f ("sched: Async unthrottling for cfs bandwidth")
> >
>
> Yes, the warning caused by this commit.
>
> >>>> in order to avoid missing rq clock update, we correspondingly add
> >>>> update_rq_clock() calls before unthrottle_cfs_rq() runs.
> >
> > These are special cases that happen because of the for_each.
> > As said above, I would prefer keeping update_rq_clock close the their user
> >
> > could we use something similar to rq_clock_skip_update() for those list ?
> >
>
> I try to do it with the method you provided. Some things maybe like this?
>
> We also need to clear RQCF_ACT_SKIP after calling rq_clock_loop_update()
> to avoid some warnings.
>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index efdab1489113..f48b5d912d8c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2420,7 +2420,6 @@ static struct rq *__migrate_task(struct rq *rq,
> struct rq_flags *rf,
> if (!is_cpu_allowed(p, dest_cpu))
> return rq;
>
> - update_rq_clock(rq);
> rq = move_queued_task(rq, rf, p, dest_cpu);
>
> return rq;
> @@ -2478,10 +2477,12 @@ static int migration_cpu_stop(void *data)
> goto out;
> }
>
> - if (task_on_rq_queued(p))
> + if (task_on_rq_queued(p)) {
> + update_rq_clock(rq);
> rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
> - else
> + } else {
> p->wake_cpu = arg->dest_cpu;
> + }
>
> /*
> * XXX __migrate_task() can fail, at which point we
> might end
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 373ff5f55884..1dcef273bebe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5575,6 +5575,13 @@ static void __cfsb_csd_unthrottle(void *arg)
> struct rq_flags rf;
>
> rq_lock(rq, &rf);
> + /*
> + * Iterating over the list can trigger several call to
> update_rq_clock()
> + * in unthrottle_cfs_rq().
> + * Do it once and skip the potential next ones.
> + */
> + update_rq_clock(rq);
> + rq_clock_loop_update(rq);
>
> /*
> * Since we hold rq lock we're safe from concurrent manipulation of
> @@ -5595,6 +5602,7 @@ static void __cfsb_csd_unthrottle(void *arg)
>
> rcu_read_unlock();
>
> + rq_clock_cancel_loop_update(rq);
> rq_unlock(rq, &rf);
> }
>
> @@ -6114,6 +6122,12 @@ static void __maybe_unused
> unthrottle_offline_cfs_rqs(struct rq *rq)
> struct task_group *tg;
>
> lockdep_assert_rq_held(rq);
> + /*
> + * The rq clock has already been updated before the
> + * set_rq_offline() runs, so we should skip updating
> + * the rq clock again in unthrottle_cfs_rq().
> + */
> + rq_clock_loop_update(rq);
>
> rcu_read_lock();
> list_for_each_entry_rcu(tg, &task_groups, list) {
> @@ -6137,6 +6151,7 @@ static void __maybe_unused
> unthrottle_offline_cfs_rqs(struct rq *rq)
> unthrottle_cfs_rq(cfs_rq);
> }
> rcu_read_unlock();
> + rq_clock_cancel_loop_update(rq);
> }
>
> #else /* CONFIG_CFS_BANDWIDTH */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ec7b3e0a2b20..3d4981d354a9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1536,6 +1536,25 @@ static inline void rq_clock_skip_update(struct rq
> *rq)
> rq->clock_update_flags |= RQCF_REQ_SKIP;
> }
>
> +/*
> + * During cpu offlining and rq wide unthrottling, we can trigger
> + * an update_rq_clock() for several cfs and rt runqueues (Typically
> + * when using list_for_each_entry_*)
> + * rq_clock_loop_update() can be called after updating the clock once
> + * and before iterating over the list to prevent multiple update.
> + */
> +static inline void rq_clock_loop_update(struct rq *rq)

maybe use rq_clock_start_loop_update

> +{
> + lockdep_assert_rq_held(rq);
> + rq->clock_update_flags |= RQCF_ACT_SKIP;
> +}
> +
> +static inline void rq_clock_cancel_loop_update(struct rq *rq)

and rq_clock_stop_loop_update here

> +{
> + lockdep_assert_rq_held(rq);
> + rq->clock_update_flags &= ~RQCF_ACT_SKIP;
> +}
> +
> /*
> * See rt task throttling, which is the only time a skip