LinuxLists.cc - [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time

2023-01-28 01:43:15

Subject: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

selftest/proc/proc-uptime-001 complains:
Euler:/mnt # while true; do ./proc-uptime-001; done
proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.

/proc/uptime should be monotonically increasing. This occurs because
the data races between get_cpu_idle_time_us and
tick_nohz_stop_idle/tick_nohz_start_idle, for example:

CPU0 CPU1
get_cpu_idle_time_us

tick_nohz_idle_exit
now = ktime_get();
tick_nohz_stop_idle
update_ts_time_stats
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta)
ts->idle_entrytime = now

now = ktime_get();
if (ts->idle_active && !nr_iowait_cpu(cpu)) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
idle = ktime_add(ts->idle_sleeptime, delta);
//idle is slightly greater than the actual value
} else {
idle = ts->idle_sleeptime;
}
ts->idle_active = 0

After this, idle = idle_sleeptime(actual idle value) + now(CPU0) - now(CPU1).
If get_cpu_idle_time_us() is called immediately after ts->idle_active = 0,
only ts->idle_sleeptime is returned, which is smaller than the previously
read one, resulting in a non-monotonically increasing idle time. In
addition, there are other data race scenarios not listed here.

This patch introduce a lock to prevent data races.

Fixes: a130e8fbc7de ("fs/proc/uptime.c: Fix idle time reporting in /proc/uptime")
Signed-off-by: Yu Liao <[email protected]>
---
kernel/time/tick-sched.c | 15 ++++++++++++++-
kernel/time/tick-sched.h | 1 +
2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..ad7d47098a58 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -36,7 +36,9 @@
/*
* Per-CPU nohz control structure
*/
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched) = {
+ .idle_time_lock = __SPIN_LOCK_UNLOCKED(tick_cpu_sched.idle_time_lock),
+};

struct tick_sched *tick_get_tick_sched(int cpu)
{
@@ -661,16 +663,24 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);

sched_clock_idle_wakeup_event();
}

static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);
sched_clock_idle_sleep_event();
}

@@ -691,12 +701,14 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long flags;
ktime_t now, idle;

if (!tick_nohz_active)
return -1;

now = ktime_get();
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
idle = ts->idle_sleeptime;
@@ -709,6 +721,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
idle = ts->idle_sleeptime;
}
}
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);

return ktime_to_us(idle);

diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..a64d4781e7af 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -81,6 +81,7 @@ struct tick_sched {
atomic_t tick_dep_mask;
unsigned long last_tick_jiffies;
unsigned int stalled_jiffies;
+ spinlock_t idle_time_lock;
};

extern struct tick_sched *tick_get_tick_sched(int cpu);
--
2.25.1

2023-01-31 14:44:07

by Thomas Gleixner

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Sat, Jan 28 2023 at 10:00, Yu Liao wrote:
> selftest/proc/proc-uptime-001 complains:
> Euler:/mnt # while true; do ./proc-uptime-001; done
> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
>
> /proc/uptime should be monotonically increasing. This occurs because
> the data races between get_cpu_idle_time_us and
> tick_nohz_stop_idle/tick_nohz_start_idle, for example:
>
> CPU0 CPU1
> get_cpu_idle_time_us
>
> tick_nohz_idle_exit
> now = ktime_get();
> tick_nohz_stop_idle
> update_ts_time_stats
> delta = ktime_sub(now, ts->idle_entrytime);
> ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta)
> ts->idle_entrytime = now
>
> now = ktime_get();
> if (ts->idle_active && !nr_iowait_cpu(cpu)) {
> ktime_t delta = ktime_sub(now, ts->idle_entrytime);
> idle = ktime_add(ts->idle_sleeptime, delta);
> //idle is slightly greater than the actual value
> } else {
> idle = ts->idle_sleeptime;
> }
> ts->idle_active = 0
>
> After this, idle = idle_sleeptime(actual idle value) + now(CPU0) - now(CPU1).
> If get_cpu_idle_time_us() is called immediately after ts->idle_active = 0,
> only ts->idle_sleeptime is returned, which is smaller than the previously
> read one, resulting in a non-monotonically increasing idle time. In
> addition, there are other data race scenarios not listed here.

Seriously this procfs accuracy is the least of the problems and if this
would be the only issue then we could trivially fix it by declaring that
the procfs output might go backwards. It's an estimate after all. If
there would be a real reason to ensure monotonicity there then we could
easily do that in the readout code.

But the real issue is that both get_cpu_idle_time_us() and
get_cpu_iowait_time_us() can invoke update_ts_time_stats() which is way
worse than the above procfs idle time going backwards.

If update_ts_time_stats() is invoked concurrently for the same CPU then
ts->idle_sleeptime and ts->iowait_sleeptime are turning into random
numbers.

This has been broken 12 years ago in commit 595aac488b54 ("sched:
Introduce a function to update the idle statistics").

> This patch introduce a lock to prevent data races.

Please search for 'This patch' in Documentation/process and act
accordingly.

> diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
> index 504649513399..a64d4781e7af 100644
> --- a/kernel/time/tick-sched.h
> +++ b/kernel/time/tick-sched.h
> @@ -81,6 +81,7 @@ struct tick_sched {
> atomic_t tick_dep_mask;
> unsigned long last_tick_jiffies;
> unsigned int stalled_jiffies;
> + spinlock_t idle_time_lock;

That must be a raw_spinlock_t and we need to look at the placement
inside the structure.

@Frederic: This structure is patently unstructured and if we fix this
issue here then we really want to look at the cache line layout for
real.

Also the name is misleading. It's protecting way more than the idle
time muck.

> static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ts->idle_time_lock, flags);

raw_spin_lock();

tick_nohz_start/stop_idle() are called with interrupts disabled.

> u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)

u64 get_cpu_iowait_time_us() requires the same treatment.

Thanks,

tglx

P.S.: I hate the spinlock in the idle code path, but I don't have a
better idea.

2023-01-31 18:36:12

by Alexey Dobriyan

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Tue, Jan 31, 2023 at 03:44:00PM +0100, Thomas Gleixner wrote:
> On Sat, Jan 28 2023 at 10:00, Yu Liao wrote:
> > selftest/proc/proc-uptime-001 complains:
> > Euler:/mnt # while true; do ./proc-uptime-001; done
> > proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
> > proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
> >
> > /proc/uptime should be monotonically increasing. This occurs because
> > the data races between get_cpu_idle_time_us and
> > tick_nohz_stop_idle/tick_nohz_start_idle, for example:
> >
> > CPU0 CPU1
> > get_cpu_idle_time_us
> >
> > tick_nohz_idle_exit
> > now = ktime_get();
> > tick_nohz_stop_idle
> > update_ts_time_stats
> > delta = ktime_sub(now, ts->idle_entrytime);
> > ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta)
> > ts->idle_entrytime = now
> >
> > now = ktime_get();
> > if (ts->idle_active && !nr_iowait_cpu(cpu)) {
> > ktime_t delta = ktime_sub(now, ts->idle_entrytime);
> > idle = ktime_add(ts->idle_sleeptime, delta);
> > //idle is slightly greater than the actual value
> > } else {
> > idle = ts->idle_sleeptime;
> > }
> > ts->idle_active = 0
> >
> > After this, idle = idle_sleeptime(actual idle value) + now(CPU0) - now(CPU1).
> > If get_cpu_idle_time_us() is called immediately after ts->idle_active = 0,
> > only ts->idle_sleeptime is returned, which is smaller than the previously
> > read one, resulting in a non-monotonically increasing idle time. In
> > addition, there are other data race scenarios not listed here.
>
> Seriously this procfs accuracy is the least of the problems and if this
> would be the only issue then we could trivially fix it by declaring that
> the procfs output might go backwards.

Declarations on l-k are meaningless.

> If there would be a real reason to ensure monotonicity there then we could
> easily do that in the readout code.

People expect it to be monotonic. I wrote this test fully expecting
that /proc/uptime is monotonic. It didn't ever occured to me that
idletime can go backwards (nor uptime, but uptime is not buggy).

> But the real issue is that both get_cpu_idle_time_us() and
> get_cpu_iowait_time_us() can invoke update_ts_time_stats() which is way
> worse than the above procfs idle time going backwards.
>
> If update_ts_time_stats() is invoked concurrently for the same CPU then
> ts->idle_sleeptime and ts->iowait_sleeptime are turning into random
> numbers.
>
> This has been broken 12 years ago in commit 595aac488b54 ("sched:
> Introduce a function to update the idle statistics").

2023-01-31 19:58:17

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Tue, Jan 31, 2023 at 03:44:00PM +0100, Thomas Gleixner wrote:

> P.S.: I hate the spinlock in the idle code path, but I don't have a
> better idea.

seqcount? It would avoid the obvious interleave and put most of the onus
on the reader (which never happens anyway).

2023-01-31 19:59:25

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Tue, Jan 31, 2023 at 09:35:39PM +0300, Alexey Dobriyan wrote:

> > Seriously this procfs accuracy is the least of the problems and if this
> > would be the only issue then we could trivially fix it by declaring that
> > the procfs output might go backwards.
>
> Declarations on l-k are meaningless.

Not really, we often do the -EWONTFIX thing.

> > If there would be a real reason to ensure monotonicity there then we could
> > easily do that in the readout code.
>
> People expect it to be monotonic. I wrote this test fully expecting
> that /proc/uptime is monotonic. It didn't ever occured to me that
> idletime can go backwards (nor uptime, but uptime is not buggy).

People want ponies too -- people will just have to cope with not having
ponies.

2023-01-31 21:11:44

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Tue, Jan 31, 2023 at 08:57:59PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 31, 2023 at 03:44:00PM +0100, Thomas Gleixner wrote:
>
> > P.S.: I hate the spinlock in the idle code path, but I don't have a
> > better idea.
>
> seqcount? It would avoid the obvious interleave and put most of the onus
> on the reader (which never happens anyway).

Yep, and do the update locally only on idle exit. But note that neither
seqcount nor spinlock will fix the nr_iowait_cpu() based thing. This counter
can be decremented remotely even during the idle period so the reader
can see an iowait period that may eventually be accounted as !iowait,
or the reverse. Breaking the monotonicity and even coherency.

That stuff is broken by design and this is the reason why it got never
really fixed. The seqcount/spinlock would make it just a bit less worse.

2023-02-01 09:03:59

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Tue, Jan 31, 2023 at 10:11:35PM +0100, Frederic Weisbecker wrote:
> On Tue, Jan 31, 2023 at 08:57:59PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 31, 2023 at 03:44:00PM +0100, Thomas Gleixner wrote:
> >
> > > P.S.: I hate the spinlock in the idle code path, but I don't have a
> > > better idea.
> >
> > seqcount? It would avoid the obvious interleave and put most of the onus
> > on the reader (which never happens anyway).
>
> Yep, and do the update locally only on idle exit. But note that neither
> seqcount nor spinlock will fix the nr_iowait_cpu() based thing. This counter
> can be decremented remotely even during the idle period so the reader
> can see an iowait period that may eventually be accounted as !iowait,
> or the reverse. Breaking the monotonicity and even coherency.
>
> That stuff is broken by design and this is the reason why it got never
> really fixed. The seqcount/spinlock would make it just a bit less worse.

Yeah, iowait is a random number generator, -EWONTFIX on that.

2023-02-01 12:02:50

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Wed, Feb 01, 2023 at 12:53:02PM +0800, Hillf Danton wrote:
> On Tue, 31 Jan 2023 15:44:00 +0100 Thomas Gleixner <[email protected]>
> >
> > Seriously this procfs accuracy is the least of the problems and if this
> > would be the only issue then we could trivially fix it by declaring that
> > the procfs output might go backwards. It's an estimate after all. If
> > there would be a real reason to ensure monotonicity there then we could
> > easily do that in the readout code.
> >
> > But the real issue is that both get_cpu_idle_time_us() and
> > get_cpu_iowait_time_us() can invoke update_ts_time_stats() which is way
> > worse than the above procfs idle time going backwards.
> >
> > If update_ts_time_stats() is invoked concurrently for the same CPU then
> > ts->idle_sleeptime and ts->iowait_sleeptime are turning into random
> > numbers.
> >
> > This has been broken 12 years ago in commit 595aac488b54 ("sched:
> > Introduce a function to update the idle statistics").
>
> [...]
>
> >
> > P.S.: I hate the spinlock in the idle code path, but I don't have a
> > better idea.
>
> Provided the percpu rule is enforced, the random numbers mentioned above
> could be erased without another spinlock added.
>
> Hillf
> +++ b/kernel/time/tick-sched.c
> @@ -640,13 +640,26 @@ static void tick_nohz_update_jiffies(kti
> /*
> * Updates the per-CPU time idle statistics counters
> */
> -static void
> -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
> +static u64 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now,
> + int io, u64 *last_update_time)
> {
> ktime_t delta;
>
> + if (last_update_time)
> + *last_update_time = ktime_to_us(now);
> +
> if (ts->idle_active) {
> delta = ktime_sub(now, ts->idle_entrytime);
> +
> + /* update is only expected on the local CPU */
> + if (cpu != smp_processor_id()) {

Why not just updating it only on idle exit then?

> + if (io)

I fear it's not up to the caller to decides if the idle time is IO or not.

> + delta = ktime_add(ts->iowait_sleeptime, delta);
> + else
> + delta = ktime_add(ts->idle_sleeptime, delta);
> + return ktime_to_us(delta);
> + }
> +
> if (nr_iowait_cpu(cpu) > 0)
> ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
> else

But you kept the old update above.

So if this is not the local CPU, what do you do?

You'd need to return (without updating iowait_sleeptime):

ts->idle_sleeptime + ktime_sub(ktime_get(), ts->idle_entrytime)

Right? But then you may race with the local updater, risking to return
the delta added twice. So you need at least a seqcount.

But in the end, nr_iowait_cpu() is broken because that counter can be
decremented remotely and so the whole thing is beyond repair:

CPU 0 CPU 1 CPU 2
----- ----- ------
//io_schedule() TASK A
current->in_iowait = 1
rq(0)->nr_iowait++
//switch to idle
// READ /proc/stat
// See nr_iowait_cpu(0) == 1
return ts->iowait_sleeptime + ktime_sub(ktime_get(), ts->idle_entrytime)

//try_to_wake_up(TASK A)
rq(0)->nr_iowait--
//idle exit
// See nr_iowait_cpu(0) == 0
ts->idle_sleeptime += ktime_sub(ktime_get(), ts->idle_entrytime)

Thanks.

2023-02-01 14:28:57

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On Wed, Feb 01, 2023 at 10:01:17PM +0800, Hillf Danton wrote:
> > > +++ b/kernel/time/tick-sched.c
> > > @@ -640,13 +640,26 @@ static void tick_nohz_update_jiffies(kti
> > > /*
> > > * Updates the per-CPU time idle statistics counters
> > > */
> > > -static void
> > > -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
> > > +static u64 update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now,
> > > + int io, u64 *last_update_time)
> > > {
> > > ktime_t delta;
> > >
> > > + if (last_update_time)
> > > + *last_update_time = ktime_to_us(now);
> > > +
> > > if (ts->idle_active) {
> > > delta = ktime_sub(now, ts->idle_entrytime);
> > > +
> > > + /* update is only expected on the local CPU */
> > > + if (cpu != smp_processor_id()) {
> >
> > Why not just updating it only on idle exit then?
>
> This aligns to idle exit as much as it can by disallowing remote update.

I mean why bother updating if idle does it for us already?

One possibility is that we get some more precise values if we read during
long idle periods with nr_iowait_cpu() changes in the middle.

> >
> > > + if (io)
> >
> > I fear it's not up to the caller to decides if the idle time is IO or not.
>
> Could you specify a bit on your concern, given the callers of this function?

You are randomly stating if the elapsing idle time is IO or not depending on
the caller, without verifying nr_iowait_cpu(). Or am I missing something?

> >
> > > + delta = ktime_add(ts->iowait_sleeptime, delta);
> > > + else
> > > + delta = ktime_add(ts->idle_sleeptime, delta);
> > > + return ktime_to_us(delta);
>
> Based on the above comments, I guest you missed this line which prevents
> get_cpu_idle_time_us() and get_cpu_iowait_time_us() from updating ts.

Right...

> > But then you may race with the local updater, risking to return
> > the delta added twice. So you need at least a seqcount.
>
> Add seqcount if needed. No problem.
> >
> > But in the end, nr_iowait_cpu() is broken because that counter can be
> > decremented remotely and so the whole thing is beyond repair:
> >
> > CPU 0 CPU 1 CPU 2
> > ----- ----- ------
> > //io_schedule() TASK A
> > current->in_iowait = 1
> > rq(0)->nr_iowait++
> > //switch to idle
> > // READ /proc/stat
> > // See nr_iowait_cpu(0) == 1
> > return ts->iowait_sleeptime + ktime_sub(ktime_get(), ts->idle_entrytime)
> >
> > //try_to_wake_up(TASK A)
> > rq(0)->nr_iowait--
> > //idle exit
> > // See nr_iowait_cpu(0) == 0
> > ts->idle_sleeptime += ktime_sub(ktime_get(), ts->idle_entrytime)
>
> Ah see your point.
>
> The diff disallows remotely updating ts, and it is updated in idle exit
> after my proposal, so what nr_iowait_cpu() breaks is mitigated.

Only halfway mitigated. This doesn't prevent from backward or forward jumps
when non-updating readers are involved at all.

Thanks.

>
> Thanks for taking a look, particularly the race linked to nr_iowait_cpu().
>
> Hillf

2023-02-07 05:25:29

by Mirsad Todorovac

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

Hi,Yu Liao,

On 28. 01. 2023. 03:00, Yu Liao wrote:
> selftest/proc/proc-uptime-001 complains:
> Euler:/mnt # while true; do ./proc-uptime-001; done
> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
>
> /proc/uptime should be monotonically increasing. This occurs because
> the data races between get_cpu_idle_time_us and
> tick_nohz_stop_idle/tick_nohz_start_idle, for example:
>
> CPU0 CPU1
> get_cpu_idle_time_us
>
> tick_nohz_idle_exit
> now = ktime_get();
> tick_nohz_stop_idle
> update_ts_time_stats
> delta = ktime_sub(now, ts->idle_entrytime);
> ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta)
> ts->idle_entrytime = now
>
> now = ktime_get();
> if (ts->idle_active && !nr_iowait_cpu(cpu)) {
> ktime_t delta = ktime_sub(now, ts->idle_entrytime);
> idle = ktime_add(ts->idle_sleeptime, delta);
> //idle is slightly greater than the actual value
> } else {
> idle = ts->idle_sleeptime;
> }
> ts->idle_active = 0
>
> After this, idle = idle_sleeptime(actual idle value) + now(CPU0) - now(CPU1).
> If get_cpu_idle_time_us() is called immediately after ts->idle_active = 0,
> only ts->idle_sleeptime is returned, which is smaller than the previously
> read one, resulting in a non-monotonically increasing idle time. In
> addition, there are other data race scenarios not listed here.
>
> This patch introduce a lock to prevent data races.
>
> Fixes: a130e8fbc7de ("fs/proc/uptime.c: Fix idle time reporting in /proc/uptime")
> Signed-off-by: Yu Liao <[email protected]>
> ---
> kernel/time/tick-sched.c | 15 ++++++++++++++-
> kernel/time/tick-sched.h | 1 +
> 2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index b0e3c9205946..ad7d47098a58 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -36,7 +36,9 @@
> /*
> * Per-CPU nohz control structure
> */
> -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
> +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched) = {
> + .idle_time_lock = __SPIN_LOCK_UNLOCKED(tick_cpu_sched.idle_time_lock),
> +};
>
> struct tick_sched *tick_get_tick_sched(int cpu)
> {
> @@ -661,16 +663,24 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
>
> static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> update_ts_time_stats(smp_processor_id(), ts, now, NULL);
> ts->idle_active = 0;
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>
> sched_clock_idle_wakeup_event();
> }
>
> static void tick_nohz_start_idle(struct tick_sched *ts)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> ts->idle_entrytime = ktime_get();
> ts->idle_active = 1;
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
> sched_clock_idle_sleep_event();
> }
>
> @@ -691,12 +701,14 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
> u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
> {
> struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
> + unsigned long flags;
> ktime_t now, idle;
>
> if (!tick_nohz_active)
> return -1;
>
> now = ktime_get();
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> if (last_update_time) {
> update_ts_time_stats(cpu, ts, now, last_update_time);
> idle = ts->idle_sleeptime;
> @@ -709,6 +721,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
> idle = ts->idle_sleeptime;
> }
> }
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>
> return ktime_to_us(idle);
>
> diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
> index 504649513399..a64d4781e7af 100644
> --- a/kernel/time/tick-sched.h
> +++ b/kernel/time/tick-sched.h
> @@ -81,6 +81,7 @@ struct tick_sched {
> atomic_t tick_dep_mask;
> unsigned long last_tick_jiffies;
> unsigned int stalled_jiffies;
> + spinlock_t idle_time_lock;
> };
>
> extern struct tick_sched *tick_get_tick_sched(int cpu);

Thank you for the patch RFC.

I can confirm that v6.2-rc7 still exhibits the issue:
marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$ while true; do ./proc-uptime-001; done
proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
Aborted (core dumped)
proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
Aborted (core dumped)
proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
Aborted (core dumped)
^C
marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$ uname -rms
Linux 6.2.0-rc7-mglru-kmemlk x86_64
marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$

Your patch, however, apparently fixes the problem.

Rebooting with the patched kernel eliminates the Assertion above, on the same hardware platform.

All the programs in the tools/testing/seltest/proc now appear to pass the test:

make[2]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/proc'
TAP version 13
1..21
# selftests: proc: fd-001-lookup
ok 1 selftests: proc: fd-001-lookup
# selftests: proc: fd-002-posix-eq
ok 2 selftests: proc: fd-002-posix-eq
# selftests: proc: fd-003-kthread
ok 3 selftests: proc: fd-003-kthread
# selftests: proc: proc-loadavg-001
ok 4 selftests: proc: proc-loadavg-001
# selftests: proc: proc-empty-vm
ok 5 selftests: proc: proc-empty-vm
# selftests: proc: proc-pid-vm
ok 6 selftests: proc: proc-pid-vm
# selftests: proc: proc-self-map-files-001
ok 7 selftests: proc: proc-self-map-files-001
# selftests: proc: proc-self-map-files-002
ok 8 selftests: proc: proc-self-map-files-002
# selftests: proc: proc-self-syscall
ok 9 selftests: proc: proc-self-syscall
# selftests: proc: proc-self-wchan
ok 10 selftests: proc: proc-self-wchan
# selftests: proc: proc-subset-pid
ok 11 selftests: proc: proc-subset-pid
# selftests: proc: proc-tid0
ok 12 selftests: proc: proc-tid0
# selftests: proc: proc-uptime-001
ok 13 selftests: proc: proc-uptime-001
# selftests: proc: proc-uptime-002
ok 14 selftests: proc: proc-uptime-002
# selftests: proc: read
ok 15 selftests: proc: read
# selftests: proc: self
ok 16 selftests: proc: self
# selftests: proc: setns-dcache
ok 17 selftests: proc: setns-dcache
# selftests: proc: setns-sysvipc
ok 18 selftests: proc: setns-sysvipc
# selftests: proc: thread-self
ok 19 selftests: proc: thread-self
# selftests: proc: proc-multiple-procfs
ok 20 selftests: proc: proc-multiple-procfs
# selftests: proc: proc-fsconfig-hidepid
ok 21 selftests: proc: proc-fsconfig-hidepid
make[2]: Leaving directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/proc'

What was applied was:

# git diff
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..ad7d47098a58 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -36,7 +36,9 @@
/*
* Per-CPU nohz control structure
*/
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched) = {
+ .idle_time_lock = __SPIN_LOCK_UNLOCKED(tick_cpu_sched.idle_time_lock),
+};

struct tick_sched *tick_get_tick_sched(int cpu)
{
@@ -661,16 +663,24 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);

sched_clock_idle_wakeup_event();
}

static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);
sched_clock_idle_sleep_event();
}

@@ -691,12 +701,14 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ unsigned long flags;
ktime_t now, idle;

if (!tick_nohz_active)
return -1;

now = ktime_get();
+ spin_lock_irqsave(&ts->idle_time_lock, flags);
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
idle = ts->idle_sleeptime;
@@ -709,6 +721,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
idle = ts->idle_sleeptime;
}
}
+ spin_unlock_irqrestore(&ts->idle_time_lock, flags);

return ktime_to_us(idle);

diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..a64d4781e7af 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -81,6 +81,7 @@ struct tick_sched {
atomic_t tick_dep_mask;
unsigned long last_tick_jiffies;
unsigned int stalled_jiffies;
+ spinlock_t idle_time_lock;
};

extern struct tick_sched *tick_get_tick_sched(int cpu);

Feel free to add: Tested-by: Mirsad Goran Todorovac <[email protected]>

Regards,
Mirsad

--
Mirsad Goran Todorovac
Sistem inženjer
Grafički fakultet | Akademija likovnih umjetnosti
Sveučilište u Zagrebu

System engineer
Faculty of Graphic Arts | Academy of Fine Arts
University of Zagreb, Republic of Croatia
The European Union

2023-02-07 08:04:17

by Mirsad Todorovac

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

On 7.2.2023. 6:25, Mirsad Goran Todorovac wrote:
> Hi,Yu Liao,
>
> On 28. 01. 2023. 03:00, Yu Liao wrote:
>> selftest/proc/proc-uptime-001 complains:
>> Euler:/mnt # while true; do ./proc-uptime-001; done
>> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
>> proc-uptime-001: proc-uptime-001.c:41: main: Assertion `i1 >= i0' failed.
>>
>> /proc/uptime should be monotonically increasing. This occurs because
>> the data races between get_cpu_idle_time_us and
>> tick_nohz_stop_idle/tick_nohz_start_idle, for example:
>>
>> CPU0 CPU1
>> get_cpu_idle_time_us
>>
>> tick_nohz_idle_exit
>> now = ktime_get();
>> tick_nohz_stop_idle
>> update_ts_time_stats
>> delta = ktime_sub(now, ts->idle_entrytime);
>> ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta)
>> ts->idle_entrytime = now
>>
>> now = ktime_get();
>> if (ts->idle_active && !nr_iowait_cpu(cpu)) {
>> ktime_t delta = ktime_sub(now, ts->idle_entrytime);
>> idle = ktime_add(ts->idle_sleeptime, delta);
>> //idle is slightly greater than the actual value
>> } else {
>> idle = ts->idle_sleeptime;
>> }
>> ts->idle_active = 0
>>
>> After this, idle = idle_sleeptime(actual idle value) + now(CPU0) - now(CPU1).
>> If get_cpu_idle_time_us() is called immediately after ts->idle_active = 0,
>> only ts->idle_sleeptime is returned, which is smaller than the previously
>> read one, resulting in a non-monotonically increasing idle time. In
>> addition, there are other data race scenarios not listed here.
>>
>> This patch introduce a lock to prevent data races.
>>
>> Fixes: a130e8fbc7de ("fs/proc/uptime.c: Fix idle time reporting in /proc/uptime")
>> Signed-off-by: Yu Liao <[email protected]>
>> ---
>> kernel/time/tick-sched.c | 15 ++++++++++++++-
>> kernel/time/tick-sched.h | 1 +
>> 2 files changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
>> index b0e3c9205946..ad7d47098a58 100644
>> --- a/kernel/time/tick-sched.c
>> +++ b/kernel/time/tick-sched.c
>> @@ -36,7 +36,9 @@
>> /*
>> * Per-CPU nohz control structure
>> */
>> -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
>> +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched) = {
>> + .idle_time_lock = __SPIN_LOCK_UNLOCKED(tick_cpu_sched.idle_time_lock),
>> +};
>>
>> struct tick_sched *tick_get_tick_sched(int cpu)
>> {
>> @@ -661,16 +663,24 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
>>
>> static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
>> {
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&ts->idle_time_lock, flags);
>> update_ts_time_stats(smp_processor_id(), ts, now, NULL);
>> ts->idle_active = 0;
>> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>>
>> sched_clock_idle_wakeup_event();
>> }
>>
>> static void tick_nohz_start_idle(struct tick_sched *ts)
>> {
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&ts->idle_time_lock, flags);
>> ts->idle_entrytime = ktime_get();
>> ts->idle_active = 1;
>> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>> sched_clock_idle_sleep_event();
>> }
>>
>> @@ -691,12 +701,14 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
>> u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
>> {
>> struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
>> + unsigned long flags;
>> ktime_t now, idle;
>>
>> if (!tick_nohz_active)
>> return -1;
>>
>> now = ktime_get();
>> + spin_lock_irqsave(&ts->idle_time_lock, flags);
>> if (last_update_time) {
>> update_ts_time_stats(cpu, ts, now, last_update_time);
>> idle = ts->idle_sleeptime;
>> @@ -709,6 +721,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
>> idle = ts->idle_sleeptime;
>> }
>> }
>> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>>
>> return ktime_to_us(idle);
>>
>> diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
>> index 504649513399..a64d4781e7af 100644
>> --- a/kernel/time/tick-sched.h
>> +++ b/kernel/time/tick-sched.h
>> @@ -81,6 +81,7 @@ struct tick_sched {
>> atomic_t tick_dep_mask;
>> unsigned long last_tick_jiffies;
>> unsigned int stalled_jiffies;
>> + spinlock_t idle_time_lock;
>> };
>>
>> extern struct tick_sched *tick_get_tick_sched(int cpu);
>
> Thank you for the patch RFC.
>
> I can confirm that v6.2-rc7 still exhibits the issue:
> marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$ while true; do ./proc-uptime-001; done
> proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
> Aborted (core dumped)
> proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
> Aborted (core dumped)
> proc-uptime-001: proc-uptime-001.c:39: main: Assertion `i1 >= i0' failed.
> Aborted (core dumped)
> ^C
> marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$ uname -rms
> Linux 6.2.0-rc7-mglru-kmemlk x86_64
> marvin@marvin-IdeaPad-3-15ITL6:~/linux/kernel/linux_torvalds/tools/testing/selftests/proc$
>
> Your patch, however, apparently fixes the problem.
>
> Rebooting with the patched kernel eliminates the Assertion above, on the same hardware platform.
>
> All the programs in the tools/testing/seltest/proc now appear to pass the test:
>
> make[2]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/proc'
> TAP version 13
> 1..21
> # selftests: proc: fd-001-lookup
> ok 1 selftests: proc: fd-001-lookup
> # selftests: proc: fd-002-posix-eq
> ok 2 selftests: proc: fd-002-posix-eq
> # selftests: proc: fd-003-kthread
> ok 3 selftests: proc: fd-003-kthread
> # selftests: proc: proc-loadavg-001
> ok 4 selftests: proc: proc-loadavg-001
> # selftests: proc: proc-empty-vm
> ok 5 selftests: proc: proc-empty-vm
> # selftests: proc: proc-pid-vm
> ok 6 selftests: proc: proc-pid-vm
> # selftests: proc: proc-self-map-files-001
> ok 7 selftests: proc: proc-self-map-files-001
> # selftests: proc: proc-self-map-files-002
> ok 8 selftests: proc: proc-self-map-files-002
> # selftests: proc: proc-self-syscall
> ok 9 selftests: proc: proc-self-syscall
> # selftests: proc: proc-self-wchan
> ok 10 selftests: proc: proc-self-wchan
> # selftests: proc: proc-subset-pid
> ok 11 selftests: proc: proc-subset-pid
> # selftests: proc: proc-tid0
> ok 12 selftests: proc: proc-tid0
> # selftests: proc: proc-uptime-001
> ok 13 selftests: proc: proc-uptime-001
> # selftests: proc: proc-uptime-002
> ok 14 selftests: proc: proc-uptime-002
> # selftests: proc: read
> ok 15 selftests: proc: read
> # selftests: proc: self
> ok 16 selftests: proc: self
> # selftests: proc: setns-dcache
> ok 17 selftests: proc: setns-dcache
> # selftests: proc: setns-sysvipc
> ok 18 selftests: proc: setns-sysvipc
> # selftests: proc: thread-self
> ok 19 selftests: proc: thread-self
> # selftests: proc: proc-multiple-procfs
> ok 20 selftests: proc: proc-multiple-procfs
> # selftests: proc: proc-fsconfig-hidepid
> ok 21 selftests: proc: proc-fsconfig-hidepid
> make[2]: Leaving directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/proc'
>
> What was applied was:
>
> # git diff
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index b0e3c9205946..ad7d47098a58 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -36,7 +36,9 @@
> /*
> * Per-CPU nohz control structure
> */
> -static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
> +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched) = {
> + .idle_time_lock = __SPIN_LOCK_UNLOCKED(tick_cpu_sched.idle_time_lock),
> +};
>
> struct tick_sched *tick_get_tick_sched(int cpu)
> {
> @@ -661,16 +663,24 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
>
> static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> update_ts_time_stats(smp_processor_id(), ts, now, NULL);
> ts->idle_active = 0;
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>
> sched_clock_idle_wakeup_event();
> }
>
> static void tick_nohz_start_idle(struct tick_sched *ts)
> {
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> ts->idle_entrytime = ktime_get();
> ts->idle_active = 1;
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
> sched_clock_idle_sleep_event();
> }
>
> @@ -691,12 +701,14 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
> u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
> {
> struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
> + unsigned long flags;
> ktime_t now, idle;
>
> if (!tick_nohz_active)
> return -1;
>
> now = ktime_get();
> + spin_lock_irqsave(&ts->idle_time_lock, flags);
> if (last_update_time) {
> update_ts_time_stats(cpu, ts, now, last_update_time);
> idle = ts->idle_sleeptime;
> @@ -709,6 +721,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
> idle = ts->idle_sleeptime;
> }
> }
> + spin_unlock_irqrestore(&ts->idle_time_lock, flags);
>
> return ktime_to_us(idle);
>
> diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
> index 504649513399..a64d4781e7af 100644
> --- a/kernel/time/tick-sched.h
> +++ b/kernel/time/tick-sched.h
> @@ -81,6 +81,7 @@ struct tick_sched {
> atomic_t tick_dep_mask;
> unsigned long last_tick_jiffies;
> unsigned int stalled_jiffies;
> + spinlock_t idle_time_lock;
> };
>
> extern struct tick_sched *tick_get_tick_sched(int cpu);
>
>
> Feel free to add: Tested-by: Mirsad Goran Todorovac <[email protected]>

P.S.

The fact that it works for me didn't take into account the kernel test robot result
I overlooked, which makes my test irrelevant.

Thanks,

--
Mirsad Todorovac
System engineer
Faculty of Graphic Arts | Academy of Fine Arts
University of Zagreb
Republic of Croatia, the European Union
--
Sistem inženjer
Grafički fakultet | Akademija likovnih umjetnosti
Sveučilište u Zagrebu

2023-02-08 15:19:39

by Frederic Weisbecker

[permalink] [raw]

Subject: [PATCH] timers/nohz: Restructure and reshuffle struct tick_sched

On Tue, Jan 31, 2023 at 03:44:00PM +0100, Thomas Gleixner wrote:
> @Frederic: This structure is patently unstructured and if we fix this
> issue here then we really want to look at the cache line layout for
> real.

I'm not really good at that, but here is my try:

---
From: Frederic Weisbecker <[email protected]>
Date: Wed, 8 Feb 2023 16:08:11 +0100
Subject: [PATCH] timers/nohz: Restructure and reshuffle struct tick_sched

Restructure and group fields by access in order to optimize cache
layout. While at it, also add missing kernel doc for two fields:
@last_jiffies and @idle_expires.

Reported-by: Thomas Gleixner <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
---
kernel/time/tick-sched.h | 66 +++++++++++++++++++++++++---------------
1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..c6663254d17d 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,81 @@ enum tick_nohz_mode {

/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
* it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;

+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};

extern struct tick_sched *tick_get_tick_sched(int cpu);
--
2.34.1

2023-02-06 07:04:20

by Oliver Sang

[permalink] [raw]

Subject: Re: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

Greeting,

FYI, we noticed BUG:spinlock_bad_magic_on_CPU due to commit (built with gcc-11):

commit: 56718b9a847cc1fb7e5e4a3ef2e076e6ea3482f3 ("[PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()")
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Liao/tick-nohz-fix-data-races-in-get_cpu_idle_time_us/20230128-120617
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 5af6ce7049365952f7f023155234fe091693ead1
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH RFC] tick/nohz: fix data races in get_cpu_idle_time_us()

in testcase: kernel-selftests
version: kernel-selftests-x86_64-d4cf28ee-1_20230110
with following parameters:

group: group-01

test-description: The kernel contains a set of "self tests" under the tools/testing/selftests/ directory. These are intended to be small unit tests to exercise individual code paths in the kernel.
test-url: https://www.kernel.org/doc/Documentation/kselftest.txt

on test machine: 4 threads Intel(R) Xeon(R) CPU E3-1225 v5 @ 3.30GHz (Skylake) with 16G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):

If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-lkp/[email protected]

[ 78.048403][ T0] BUG: spinlock bad magic on CPU#3, swapper/3/0
[ 78.054523][ T0] lock: 0xffff88837f5ac3a0, .magic: 00000000, .owner: <none>/-1, .owner_cpu: 0
[ 78.063433][ T0] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.2.0-rc5-00200-g56718b9a847c #1
[ 78.072057][ T0] Hardware name: HP HP Z238 Microtower Workstation/8183, BIOS N51 Ver. 01.63 10/05/2017
[ 78.081637][ T0] Call Trace:
[ 78.084789][ T0] <TASK>
[ 78.087593][ T0] dump_stack_lvl (lib/dump_stack.c:107 (discriminator 4))
[ 78.091970][ T0] do_raw_spin_lock (kernel/locking/spinlock_debug.c:85 kernel/locking/spinlock_debug.c:114)
[ 78.096691][ T0] ? rwlock_bug+0x90/0x90
[ 78.101499][ T0] ? lock_downgrade (kernel/locking/lockdep.c:5320)
[ 78.106221][ T0] _raw_spin_lock_irqsave (kernel/locking/spinlock.c:162)
[ 78.111289][ T0] ? tick_nohz_idle_enter (kernel/time/tick-sched.c:680 (discriminator 3) kernel/time/tick-sched.c:1173 (discriminator 3))
[ 78.116442][ T0] tick_nohz_idle_enter (kernel/time/tick-sched.c:680 (discriminator 3) kernel/time/tick-sched.c:1173 (discriminator 3))
[ 78.121437][ T0] do_idle (include/linux/cpumask.h:444 include/linux/cpumask.h:1030 kernel/sched/idle.c:284)
[ 78.125285][ T0] cpu_startup_entry (kernel/sched/idle.c:399 (discriminator 1))
[ 78.129919][ T0] start_secondary (arch/x86/kernel/smpboot.c:198 arch/x86/kernel/smpboot.c:232)
[ 78.134544][ T0] ? set_cpu_sibling_map (arch/x86/kernel/smpboot.c:216)
[ 78.139874][ T0] secondary_startup_64_no_verify (arch/x86/kernel/head_64.S:358)
[ 78.145638][ T0] </TASK>

To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file

# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

Attachments:

(No filename) (3.37 kB)
config-6.2.0-rc5-00200-g56718b9a847c (166.68 kB)
job-script (6.60 kB)
dmesg.xz (142.26 kB)
kernel-selftests (248.34 kB)
job.yaml (5.42 kB)
reproduce (1.31 kB)
Download all attachments