2019-03-13 15:09:50

by Phil Auld

Subject: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

With an extremely short cfs_period_us setting on a parent task group with a
large number of children, the for loop in sched_cfs_period_timer() can run
until the watchdog fires. There is no guarantee that the call to
hrtimer_forward_now() will ever return 0. The large number of children can
make do_sched_cfs_period_timer() take longer than the period.

[ 217.264946] NMI watchdog: Watchdog detected hard LOCKUP on cpu 24
[ 217.264948] Modules linked in: sunrpc iTCO_wdt gpio_ich iTCO_vendor_support intel_powerclamp coretemp kvm_intel
+kvm ipmi_ssif irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_si intel_cstate joydev
+ipmi_devintf pcspkr hpilo intel_uncore sg hpwdt ipmi_msghandler acpi_power_meter i7core_edac lpc_ich acpi_cpufreq
+ip_tables xfs libcrc32c sr_mod sd_mod cdrom ata_generic radeon i2c_algo_bit drm_kms_helper syscopyarea
+sysfillrect sysimgblt fb_sys_fops ttm ata_piix drm serio_raw crc32c_intel hpsa myri10ge libata dca
+scsi_transport_sas netxen_nic dm_mirror dm_region_hash dm_log dm_mod
[ 217.264964] CPU: 24 PID: 46684 Comm: myapp Not tainted 5.0.0-rc7+ #25
[ 217.264965] Hardware name: HP ProLiant DL580 G7, BIOS P65 08/16/2015
[ 217.264965] RIP: 0010:tg_nop+0x0/0x10
[ 217.264966] Code: 83 c9 08 f0 48 0f b1 0f 48 39 c2 74 0e 48 89 c2 f7 c2 00 00 20 00 75 dc 31 c0 c3 b8 01 00 00
+00 c3 66 0f 1f 84 00 00 00 00 00 <66> 66 66 66 90 31 c0 c3 0f 1f 84 00 00 00 00 00 66 66 66 66 90 8b
[ 217.264967] RSP: 0000:ffff976a7f703e48 EFLAGS: 00000087
[ 217.264967] RAX: ffff976a7c7c8f00 RBX: ffff976a6f4fad00 RCX: ffff976a7c7c90f0
[ 217.264968] RDX: ffff976a6f4faee0 RSI: ffff976a7f961f00 RDI: ffff976a6f4fad00
[ 217.264968] RBP: ffff976a7f961f00 R08: 0000000000000002 R09: ffffffad2c9b3331
[ 217.264969] R10: 0000000000000000 R11: 0000000000000000 R12: ffff976a7c7c8f00
[ 217.264969] R13: ffffffffb2305c00 R14: 0000000000000000 R15: ffffffffb22f8510
[ 217.264970] FS: 00007f6240678740(0000) GS:ffff976a7f700000(0000) knlGS:0000000000000000
[ 217.264970] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 217.264971] CR2: 00000000006dee20 CR3: 000000bf2bffc005 CR4: 00000000000206e0
[ 217.264971] Call Trace:
[ 217.264971] <IRQ>
[ 217.264972] walk_tg_tree_from+0x29/0xb0
[ 217.264972] unthrottle_cfs_rq+0xe0/0x1a0
[ 217.264972] distribute_cfs_runtime+0xd3/0xf0
[ 217.264973] sched_cfs_period_timer+0xcb/0x160
[ 217.264973] ? sched_cfs_slack_timer+0xd0/0xd0
[ 217.264973] __hrtimer_run_queues+0xfb/0x270
[ 217.264974] hrtimer_interrupt+0x122/0x270
[ 217.264974] smp_apic_timer_interrupt+0x6a/0x140
[ 217.264975] apic_timer_interrupt+0xf/0x20
[ 217.264975] </IRQ>
[ 217.264975] RIP: 0033:0x7f6240125fe5
[ 217.264976] Code: 44 17 d0 f3 44 0f 7f 47 30 f3 44 0f 7f 44 17 c0 48 01 fa 48 83 e2 c0 48 39 d1 74 a3 66 0f 1f
+84 00 00 00 00 00 66 44 0f 7f 01 <66> 44 0f 7f 41 10 66 44 0f 7f 41 20 66 44 0f 7f 41 30 48 83 c1 40
...

To prevent this we add protection to the loop that detects when the loop has run
too many times and scales the period and quota up, proportionally, so that the
timer can complete before the next period expires. This preserves the relative
runtime quota while preventing the hard lockup.
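
For example (illustrative numbers only): if cfs_period_us is 2000 and
cfs_quota_us is 100000, and the eight loop iterations average roughly 4300us
each, the period is raised to ~115% of that average (about 5000us) and the
quota is scaled by the same factor to about 250000us, so the configured
quota:period ratio is unchanged.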

A warning is issued reporting this state and the new values.

The scaling by average time over count was suggested by Ben Segall <[email protected]>.

Signed-off-by: Phil Auld <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra (Intel) <[email protected]>
---

Note: This is against v5.0 as suggested by the documentation. It won't apply to 5.0+ due to
the change to raw_spin_lock_irqsave. I can respin as needed.

kernel/sched/fair.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b..90cc67bbf592 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4859,19 +4859,51 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}

+extern const u64 max_cfs_quota_period;
+int cfs_period_autotune_loop_limit = 8;
+int cfs_period_autotune_cushion_pct = 15; /* percentage added to period recalculation */
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ s64 nsstart, nsnow, new_period;
int overrun;
int idle = 0;
+ int count = 0;

raw_spin_lock(&cfs_b->lock);
+ nsstart = ktime_to_ns(hrtimer_cb_get_time(timer));
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;

+ if (++count > cfs_period_autotune_loop_limit) {
+ ktime_t old_period = ktime_to_ns(cfs_b->period);
+
+ nsnow = ktime_to_ns(hrtimer_cb_get_time(timer));
+ new_period = (nsnow - nsstart)/cfs_period_autotune_loop_limit;
+
+ /* Make sure new period will be larger than old. */
+ if (new_period < old_period) {
+ new_period = old_period;
+ }
+ new_period += (new_period * cfs_period_autotune_cushion_pct) / 100;
+
+ if (new_period > max_cfs_quota_period)
+ new_period = max_cfs_quota_period;
+
+ cfs_b->period = ns_to_ktime(new_period);
+ cfs_b->quota += (cfs_b->quota * ((new_period - old_period) * 100)/old_period)/100;
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(), cfs_b->period/NSEC_PER_USEC, cfs_b->quota/NSEC_PER_USEC);
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
if (idle)
--
2.18.0



2019-03-15 10:15:06

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Wed, Mar 13, 2019 at 11:08:26AM -0400, Phil Auld wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 310d0637fe4b..90cc67bbf592 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4859,19 +4859,51 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> +extern const u64 max_cfs_quota_period;
> +int cfs_period_autotune_loop_limit = 8;
> +int cfs_period_autotune_cushion_pct = 15; /* percentage added to period recalculation */

static const ?

> +
> static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> {
> struct cfs_bandwidth *cfs_b =
> container_of(timer, struct cfs_bandwidth, period_timer);
> + s64 nsstart, nsnow, new_period;

u64

> int overrun;
> int idle = 0;
> + int count = 0;
>
> raw_spin_lock(&cfs_b->lock);
> + nsstart = ktime_to_ns(hrtimer_cb_get_time(timer));

we really should kill ktime :/ Anyway, you now do two indirect timer
calls back to back :/

And this is unconditional overhead.

> for (;;) {
> overrun = hrtimer_forward_now(timer, cfs_b->period);
> if (!overrun)
> break;
>
> + if (++count > cfs_period_autotune_loop_limit) {
> + ktime_t old_period = ktime_to_ns(cfs_b->period);
> +
> + nsnow = ktime_to_ns(hrtimer_cb_get_time(timer));
> + new_period = (nsnow - nsstart)/cfs_period_autotune_loop_limit;
> +
> + /* Make sure new period will be larger than old. */
> + if (new_period < old_period) {
> + new_period = old_period;
> + }
> + new_period += (new_period * cfs_period_autotune_cushion_pct) / 100;

Computers _suck_ at /100. And since you're free to pick the constant,
pick a power of two, computers love those.
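
(For instance, something like new_period += (new_period * 19) >> 7 gives
roughly the same ~15% cushion with a shift instead of a divide; 19/128 is
about 14.8%.)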

> +
> + if (new_period > max_cfs_quota_period)
> + new_period = max_cfs_quota_period;
> +
> + cfs_b->period = ns_to_ktime(new_period);
> + cfs_b->quota += (cfs_b->quota * ((new_period - old_period) * 100)/old_period)/100;

srsly!? Again, you can pick the constant to be anything, and you pick
such a horrid number?!

> + pr_warn_ratelimited(
> + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> + smp_processor_id(), cfs_b->period/NSEC_PER_USEC, cfs_b->quota/NSEC_PER_USEC);

period was ktime_t, remember...

> +

And these here lines are all waaay too long.

Also, is that complexity really needed?

> + /* reset count so we don't come right back in here */
> + count = 0;
> + }
> +
> idle = do_sched_cfs_period_timer(cfs_b, overrun);
> }
> if (idle)


Would not something simpler like the below also work?


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea74d43924b2..b71557be6b42 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}

+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;

raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
@@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;

+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota /= old;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ new/NSEC_PER_USEC,
+ cfs_b->quota/NSEC_PER_USEC);
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)

2019-03-15 11:24:45

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 11:11:50AM +0100, Peter Zijlstra wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ea74d43924b2..b71557be6b42 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> +extern const u64 max_cfs_quota_period;
> +
> static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> {
> struct cfs_bandwidth *cfs_b =
> @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> unsigned long flags;
> int overrun;
> int idle = 0;
> + int count = 0;
>
> raw_spin_lock_irqsave(&cfs_b->lock, flags);
> for (;;) {
> @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> if (!overrun)
> break;
>
> + if (++count > 3) {
> + u64 new, old = ktime_to_ns(cfs_b->period);
> +
> + new = (old * 147) / 128; /* ~115% */
> + new = min(new, max_cfs_quota_period);

Also, we can still engineer things to come unstuck; if we explicitly
configure period at 1e9 and then set a really small quota and then
create this insane amount of cgroups you have..

this code has no room to manoeuvre left.

Do we want to do anything about that? Or leave it as is, don't do that
then?

> +
> + cfs_b->period = ns_to_ktime(new);
> +
> + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> + cfs_b->quota *= new;
> + cfs_b->quota /= old;
> +
> + pr_warn_ratelimited(
> + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> + smp_processor_id(),
> + new/NSEC_PER_USEC,
> + cfs_b->quota/NSEC_PER_USEC);
> +
> + /* reset count so we don't come right back in here */
> + count = 0;
> + }
> +
> idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> }
> if (idle)

2019-03-15 13:31:50

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 11:11:50AM +0100 Peter Zijlstra wrote:
> On Wed, Mar 13, 2019 at 11:08:26AM -0400, Phil Auld wrote:
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 310d0637fe4b..90cc67bbf592 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4859,19 +4859,51 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > return HRTIMER_NORESTART;
> > }
> >
> > +extern const u64 max_cfs_quota_period;
> > +int cfs_period_autotune_loop_limit = 8;
> > +int cfs_period_autotune_cushion_pct = 15; /* percentage added to period recalculation */
>
> static const ?
>
> > +
> > static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > {
> > struct cfs_bandwidth *cfs_b =
> > container_of(timer, struct cfs_bandwidth, period_timer);
> > + s64 nsstart, nsnow, new_period;
>
> u64
>
> > int overrun;
> > int idle = 0;
> > + int count = 0;
> >
> > raw_spin_lock(&cfs_b->lock);
> > + nsstart = ktime_to_ns(hrtimer_cb_get_time(timer));
>
> we really should kill ktime :/ Anyway, you now do two indirect timer
> calls back to back :/
>
> And this is unconditional overhead.
>
> > for (;;) {
> > overrun = hrtimer_forward_now(timer, cfs_b->period);
> > if (!overrun)
> > break;
> >
> > + if (++count > cfs_period_autotune_loop_limit) {
> > + ktime_t old_period = ktime_to_ns(cfs_b->period);
> > +
> > + nsnow = ktime_to_ns(hrtimer_cb_get_time(timer));
> > + new_period = (nsnow - nsstart)/cfs_period_autotune_loop_limit;
> > +
> > + /* Make sure new period will be larger than old. */
> > + if (new_period < old_period) {
> > + new_period = old_period;
> > + }
> > + new_period += (new_period * cfs_period_autotune_cushion_pct) / 100;
>
> Computers _suck_ at /100. And since you're free to pick the constant,
> pick a power of two, computers love those.
>

Fair enough, I was thinking percents. And also that once we get in here it's not a really hot path.

> > +
> > + if (new_period > max_cfs_quota_period)
> > + new_period = max_cfs_quota_period;
> > +
> > + cfs_b->period = ns_to_ktime(new_period);
> > + cfs_b->quota += (cfs_b->quota * ((new_period - old_period) * 100)/old_period)/100;
>
> srsly!? Again, you can pick the constant to be anything, and you pick
> such a horrid number?!
>

Same as above :)


> > + pr_warn_ratelimited(
> > + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> > + smp_processor_id(), cfs_b->period/NSEC_PER_USEC, cfs_b->quota/NSEC_PER_USEC);
>
> period was ktime_t, remember...
>
> > +
>
> And these here lines are all waaay too long.
>
> Also, is that complexity really needed?
>
> > + /* reset count so we don't come right back in here */
> > + count = 0;
> > + }
> > +
> > idle = do_sched_cfs_period_timer(cfs_b, overrun);
> > }
> > if (idle)
>
>
> Would not something simpler like the below also work?
>

I expect it would.... but the original concept increased the period to
a bit more (15%) than the average time it was taking to go around the loop.
This version will run the loop a lot longer as it's only increasing by
15% each time.

In my setup it would go from ~2000 to ~11000 and be done. This one
will go off, raise 2000 to 2300, then fire again, raise to 2645, etc.
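
(Back of the envelope: at ~15% a pop it takes a dozen or so firings to climb
from ~2000 up past 11000.)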

Unless it's getting reset this would still be a one-time thing, so that
may not matter.

The math calculations do look better though. I knew there had to be a
better way to scale up the quota. It just wasn't jumping out at me :)


Thanks,

Phil

>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ea74d43924b2..b71557be6b42 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> +extern const u64 max_cfs_quota_period;
> +
> static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> {
> struct cfs_bandwidth *cfs_b =
> @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> unsigned long flags;
> int overrun;
> int idle = 0;
> + int count = 0;
>
> raw_spin_lock_irqsave(&cfs_b->lock, flags);
> for (;;) {
> @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> if (!overrun)
> break;
>
> + if (++count > 3) {
> + u64 new, old = ktime_to_ns(cfs_b->period);
> +
> + new = (old * 147) / 128; /* ~115% */
> + new = min(new, max_cfs_quota_period);
> +
> + cfs_b->period = ns_to_ktime(new);
> +
> + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> + cfs_b->quota *= new;
> + cfs_b->quota /= old;
> +
> + pr_warn_ratelimited(
> + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> + smp_processor_id(),
> + new/NSEC_PER_USEC,
> + cfs_b->quota/NSEC_PER_USEC);
> +
> + /* reset count so we don't come right back in here */
> + count = 0;
> + }
> +
> idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> }
> if (idle)

--

2019-03-15 13:52:31

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 11:33:57AM +0100 Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 11:11:50AM +0100, Peter Zijlstra wrote:
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index ea74d43924b2..b71557be6b42 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > return HRTIMER_NORESTART;
> > }
> >
> > +extern const u64 max_cfs_quota_period;
> > +
> > static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > {
> > struct cfs_bandwidth *cfs_b =
> > @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > unsigned long flags;
> > int overrun;
> > int idle = 0;
> > + int count = 0;
> >
> > raw_spin_lock_irqsave(&cfs_b->lock, flags);
> > for (;;) {
> > @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > if (!overrun)
> > break;
> >
> > + if (++count > 3) {
> > + u64 new, old = ktime_to_ns(cfs_b->period);
> > +
> > + new = (old * 147) / 128; /* ~115% */
> > + new = min(new, max_cfs_quota_period);
>
> Also, we can still engineer things to come unstuck; if we explicitly
> configure period at 1e9 and then set a really small quota and then
> create this insane amount of cgroups you have..
>
> this code has no room to manoeuvre left.
>
> Do we want to do anything about that? Or leave it as is, don't do that
> then?
>

If the period is 1s it would be hard to make this loop fire repeatedly. I don't think
it's that dependent on the quota other than getting some rqs throttled. The small quota
would also mean fewer of them would get unthrottled per distribute call. You'd probably
need _significantly_ more cgroups than my insane 2500 to hit it.

Right now it settles out with a new period of ~12-15ms. So ~200,000 cgroups?
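
(That's just extrapolating linearly: if ~2500 children make the unthrottle walk
take ~12-15ms, then pushing it out to the order of the 1s max period works out
to roughly 1s/13ms * 2500, assuming the walk time keeps scaling with the child
count.)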

Ben and I talked a little about this in another thread. I think hitting this is enough of
an edge case that this approach will make the problem go away. The only alternative we
came up with to reduce the time taken in unthrottle involved a fair bit of complexity
added to the everyday code paths. And it might not help if the children all had their
own quota/period settings active.

Thoughts?


Cheers,
Phil



> > +
> > + cfs_b->period = ns_to_ktime(new);
> > +
> > + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> > + cfs_b->quota *= new;
> > + cfs_b->quota /= old;
> > +
> > + pr_warn_ratelimited(
> > + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> > + smp_processor_id(),
> > + new/NSEC_PER_USEC,
> > + cfs_b->quota/NSEC_PER_USEC);
> > +
> > + /* reset count so we don't come right back in here */
> > + count = 0;
> > + }
> > +
> > idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> > }
> > if (idle)

--

2019-03-15 15:31:51

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 11:11:50AM +0100 Peter Zijlstra wrote:
> On Wed, Mar 13, 2019 at 11:08:26AM -0400, Phil Auld wrote:

...

> Computers _suck_ at /100. And since you're free to pick the constant,
> pick a power of two, computers love those.
>
> > +
> > + if (new_period > max_cfs_quota_period)
> > + new_period = max_cfs_quota_period;
> > +
> > + cfs_b->period = ns_to_ktime(new_period);
> > + cfs_b->quota += (cfs_b->quota * ((new_period - old_period) * 100)/old_period)/100;
>
> srsly!? Again, you can pick the constant to be anything, and you pick
> such a horrid number?!
>

In my defense here, all the fair.c imbalance pct code also uses 100 :)


> > + pr_warn_ratelimited(
> > + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> > + smp_processor_id(), cfs_b->period/NSEC_PER_USEC, cfs_b->quota/NSEC_PER_USEC);
>
> period was ktime_t, remember...

Indeed. Worked but was incorrect.

>
> Would not something simpler like the below also work?


With my version:

[ 4246.030004] cfs_period_timer[cpu16]: period too short, scaling up (new cfs_period_us 5276, cfs_quota_us = 303973)
[ 4246.346978] cfs_period_timer[cpu16]: period too short, scaling up (new cfs_period_us 17474, cfs_quota_us = 1006569)

(most of the time it's only one message. Sometimes it does a smaller increase once first like this)



with the below:

[ 117.235804] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2492, cfs_quota_us = 143554)
[ 117.346807] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2862, cfs_quota_us = 164863)
[ 117.470569] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3286, cfs_quota_us = 189335)
[ 117.574883] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3774, cfs_quota_us = 217439)
[ 117.652907] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4335, cfs_quota_us = 249716)
[ 118.090535] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4978, cfs_quota_us = 286783)
[ 122.098009] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 5717, cfs_quota_us = 329352)
[ 126.255209] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 6566, cfs_quota_us = 378240)
[ 126.358060] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 7540, cfs_quota_us = 434385)
[ 126.538358] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 8660, cfs_quota_us = 498865)
[ 126.614304] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 9945, cfs_quota_us = 572915)
[ 126.817085] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 11422, cfs_quota_us = 657957)
[ 127.352038] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 13117, cfs_quota_us = 755623)
[ 127.598043] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 15064, cfs_quota_us = 867785)


Plus on repeats I see an occasional

[ 152.803384] sched_cfs_period_timer: 9 callbacks suppressed



I'll rework the maths in the averaged version and post v2 if that makes sense.

It may have the extra timer fetch, although maybe I could rework it so that it used the
nsstart time the first time and did not need to do it twice in a row. I had originally
reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.


Thanks for looking at it.


Also, fwiw, this was reported earlier by Anton Blanchard in https://lkml.org/lkml/2018/12/3/1047


Cheers,
Phil

>
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ea74d43924b2..b71557be6b42 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> +extern const u64 max_cfs_quota_period;
> +
> static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> {
> struct cfs_bandwidth *cfs_b =
> @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> unsigned long flags;
> int overrun;
> int idle = 0;
> + int count = 0;
>
> raw_spin_lock_irqsave(&cfs_b->lock, flags);
> for (;;) {
> @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> if (!overrun)
> break;
>
> + if (++count > 3) {
> + u64 new, old = ktime_to_ns(cfs_b->period);
> +
> + new = (old * 147) / 128; /* ~115% */
> + new = min(new, max_cfs_quota_period);
> +
> + cfs_b->period = ns_to_ktime(new);
> +
> + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> + cfs_b->quota *= new;
> + cfs_b->quota /= old;
> +
> + pr_warn_ratelimited(
> + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> + smp_processor_id(),
> + new/NSEC_PER_USEC,
> + cfs_b->quota/NSEC_PER_USEC);
> +
> + /* reset count so we don't come right back in here */
> + count = 0;
> + }
> +
> idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> }
> if (idle)

--

2019-03-15 16:02:08

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 09:51:25AM -0400, Phil Auld wrote:
> On Fri, Mar 15, 2019 at 11:33:57AM +0100 Peter Zijlstra wrote:
> > On Fri, Mar 15, 2019 at 11:11:50AM +0100, Peter Zijlstra wrote:
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index ea74d43924b2..b71557be6b42 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > > return HRTIMER_NORESTART;
> > > }
> > >
> > > +extern const u64 max_cfs_quota_period;
> > > +
> > > static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > {
> > > struct cfs_bandwidth *cfs_b =
> > > @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > unsigned long flags;
> > > int overrun;
> > > int idle = 0;
> > > + int count = 0;
> > >
> > > raw_spin_lock_irqsave(&cfs_b->lock, flags);
> > > for (;;) {
> > > @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > if (!overrun)
> > > break;
> > >
> > > + if (++count > 3) {
> > > + u64 new, old = ktime_to_ns(cfs_b->period);
> > > +
> > > + new = (old * 147) / 128; /* ~115% */
> > > + new = min(new, max_cfs_quota_period);
> >
> > Also, we can still engineer things to come unstuck; if we explicitly
> > configure period at 1e9 and then set a really small quota and then
> > create this insane amount of cgroups you have..
> >
> > this code has no room to manoeuvre left.
> >
> > Do we want to do anything about that? Or leave it as is, don't do that
> > then?
> >
>
> If the period is 1s it would be hard to make this loop fire repeatedly. I don't think
> it's that dependent on the quota other than getting some rqs throttled. The small quota
> would also mean fewer of them would get unthrottled per distribute call. You'd probably
> need _significantly_ more cgroups than my insane 2500 to hit it.
>
> Right now it settles out with a new period of ~12-15ms. So ~200,000 cgroups?
>
> Ben and I talked a little about this in another thread. I think hitting this is enough of
> an edge case that this approach will make the problem go away. The only alternative we
> came up with to reduce the time taken in unthrottle involved a fair bit of complexity
> > added to the everyday code paths. And it might not help if the children all had their
> own quota/period settings active.

Ah right. I forgot that part. And yes, I remember what was proposed to
avoid the tree walk, that wouldn't have been pretty.

2019-03-15 16:05:00

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 11:30:42AM -0400, Phil Auld wrote:

> In my defense here, all the fair.c imbalance pct code also uses 100 :)

Yes, I know, I hate on that too ;-) Just never got around to fixing
that.


> with the below:
>
> [ 117.235804] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2492, cfs_quota_us = 143554)
> [ 117.346807] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2862, cfs_quota_us = 164863)
> [ 117.470569] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3286, cfs_quota_us = 189335)
> [ 117.574883] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3774, cfs_quota_us = 217439)
> [ 117.652907] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4335, cfs_quota_us = 249716)
> [ 118.090535] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4978, cfs_quota_us = 286783)
> [ 122.098009] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 5717, cfs_quota_us = 329352)
> [ 126.255209] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 6566, cfs_quota_us = 378240)
> [ 126.358060] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 7540, cfs_quota_us = 434385)
> [ 126.538358] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 8660, cfs_quota_us = 498865)
> [ 126.614304] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 9945, cfs_quota_us = 572915)
> [ 126.817085] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 11422, cfs_quota_us = 657957)
> [ 127.352038] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 13117, cfs_quota_us = 755623)
> [ 127.598043] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 15064, cfs_quota_us = 867785)
>
>
> Plus on repeats I see an occasional
>
> [ 152.803384] sched_cfs_period_timer: 9 callbacks suppressed

That should be fine, right? It's a fallback for an edge case and
shouldn't trigger too often anyway.

>> I'll rework the maths in the averaged version and post v2 if that makes sense.
>
> It may have the extra timer fetch, although maybe I could rework it so that it used the
> nsstart time the first time and did not need to do it twice in a row. I had originally
> reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.

Sure; but remember, simpler is often better, esp. for code that
typically 'never' runs.

> Also, fwiw, this was reported earlier by Anton Blanchard in https://lkml.org/lkml/2018/12/3/1047

Bah, yes, I sometimes lose track of things :/

2019-03-15 16:19:31

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 05:03:47PM +0100 Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 11:30:42AM -0400, Phil Auld wrote:
>
> > In my defense here, all the fair.c imbalance pct code also uses 100 :)
>
> Yes, I know, I hate on that too ;-) Just never got around to fixing
> that.
>
>
> > with the below:
> >
> > [ 117.235804] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2492, cfs_quota_us = 143554)
> > [ 117.346807] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 2862, cfs_quota_us = 164863)
> > [ 117.470569] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3286, cfs_quota_us = 189335)
> > [ 117.574883] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 3774, cfs_quota_us = 217439)
> > [ 117.652907] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4335, cfs_quota_us = 249716)
> > [ 118.090535] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 4978, cfs_quota_us = 286783)
> > [ 122.098009] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 5717, cfs_quota_us = 329352)
> > [ 126.255209] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 6566, cfs_quota_us = 378240)
> > [ 126.358060] cfs_period_timer[cpu2]: period too short, scaling up (new cfs_period_us 7540, cfs_quota_us = 434385)
> > [ 126.538358] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 8660, cfs_quota_us = 498865)
> > [ 126.614304] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 9945, cfs_quota_us = 572915)
> > [ 126.817085] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 11422, cfs_quota_us = 657957)
> > [ 127.352038] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 13117, cfs_quota_us = 755623)
> > [ 127.598043] cfs_period_timer[cpu9]: period too short, scaling up (new cfs_period_us 15064, cfs_quota_us = 867785)
> >
> >
> > Plus on repeats I see an occasional
> >
> > [ 152.803384] sched_cfs_period_timer: 9 callbacks suppressed
>
> That should be fine, right? It's a fallback for an edge case and
> shouldn't trigger too often anyway.

It doesn't hit the NMI, it just takes a bit longer to get out. The output is a little
messier, but as you say, it's a fallback. If you're okay with it, do you want to
just use your patch?

Otherwise, I'm happy to do a fixup v2.

>
> >> I'll rework the maths in the averaged version and post v2 if that makes sense.
> >
> > It may have the extra timer fetch, although maybe I could rework it so that it used the
> > nsstart time the first time and did not need to do it twice in a row. I had originally
> > reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.
>
> Sure; but remember, simpler is often better, esp. for code that
> typically 'never' runs.
>
> > Also, fwiw, this was reported earlier by Anton Blanchard in https://lkml.org/lkml/2018/12/3/1047
>
> Bah, yes, I sometimes lose track of things :/


No worries. I just meant that to show I was not the only one with these low settings,
and to give credit, or whatever :)


Cheers,
Phil

--

2019-03-15 16:20:51

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 04:59:33PM +0100 Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 09:51:25AM -0400, Phil Auld wrote:
> > On Fri, Mar 15, 2019 at 11:33:57AM +0100 Peter Zijlstra wrote:
> > > On Fri, Mar 15, 2019 at 11:11:50AM +0100, Peter Zijlstra wrote:
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index ea74d43924b2..b71557be6b42 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > > > return HRTIMER_NORESTART;
> > > > }
> > > >
> > > > +extern const u64 max_cfs_quota_period;
> > > > +
> > > > static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > > {
> > > > struct cfs_bandwidth *cfs_b =
> > > > @@ -4892,6 +4894,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > > unsigned long flags;
> > > > int overrun;
> > > > int idle = 0;
> > > > + int count = 0;
> > > >
> > > > raw_spin_lock_irqsave(&cfs_b->lock, flags);
> > > > for (;;) {
> > > > @@ -4899,6 +4902,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > > > if (!overrun)
> > > > break;
> > > >
> > > > + if (++count > 3) {
> > > > + u64 new, old = ktime_to_ns(cfs_b->period);
> > > > +
> > > > + new = (old * 147) / 128; /* ~115% */
> > > > + new = min(new, max_cfs_quota_period);
> > >
> > > Also, we can still engineer things to come unstuck; if we explicitly
> > > configure period at 1e9 and then set a really small quota and then
> > > create this insane amount of cgroups you have..
> > >
> > > this code has no room to manoeuvre left.
> > >
> > > Do we want to do anything about that? Or leave it as is, don't do that
> > > then?
> > >
> >
> > If the period is 1s it would be hard to make this loop fire repeatedly. I don't think
> > it's that dependent on the quota other than getting some rqs throttled. The small quota
> > would also mean fewer of them would get unthrottled per distribute call. You'd probably
> > need _significantly_ more cgroups than my insane 2500 to hit it.
> >
> > Right now it settles out with a new period of ~12-15ms. So ~200,000 cgroups?
> >
> > Ben and I talked a little about this in another thread. I think hitting this is enough of
> > an edge case that this approach will make the problem go away. The only alternative we
> > came up with to reduce the time taken in unthrottle involved a fair bit of complexity
> > added to the everyday code paths. And it might not help if the children all had their
> > own quota/period settings active.
>
> Ah right. I forgot that part. And yes, I remember what was proposed to
> avoid the tree walk, that wouldn't have been pretty.

I'm glad I was not the only one who was not excited by that :)



--

2019-03-15 16:30:15

by Peter Zijlstra

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 09:30:42AM -0400, Phil Auld wrote:
> On Fri, Mar 15, 2019 at 11:11:50AM +0100 Peter Zijlstra wrote:

> > Computers _suck_ at /100. And since you're free to pick the constant,
> > pick a power of two, computers love those.
> >
>
> Fair enough, I was thinking percents. And also that once we get in
> here it's not a really hot path.

I've seen people say that before; but that's not something I can relate
to. The basic idea is fractions.


2019-03-18 13:30:13

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Fri, Mar 15, 2019 at 05:03:47PM +0100 Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 11:30:42AM -0400, Phil Auld wrote:
>
> >> I'll rework the maths in the averaged version and post v2 if that makes sense.
> >
> > It may have the extra timer fetch, although maybe I could rework it so that it used the
> > nsstart time the first time and did not need to do it twice in a row. I had originally
> > reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.
>
> Sure; but remember, simpler is often better, esp. for code that
> typically 'never' runs.

I reworked it to the below. This settles a bit faster. The average is sort of squishy because
it's 3 samples divided by 4. And if we stay in a single call after updating the period, the
"average" will be even less accurate.
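
(With made-up numbers: if the three completed iterations took ~9ms total,
(now - start) >> 2 gives ~2.25ms instead of the true ~3ms average, and the
147/128 cushion only pulls that back up to ~2.6ms.)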

It settles at a larger value faster, so it produces fewer messages and none of the callback
suppressed ones. The added complexity may not be worth it, though.

I think this or your version, either one, would work.

What needs to happen now to get one of them to land somewhere? Should I just repost one with my
Signed-off-by and let you add whatever other tags? And if so, do you have a preference for which one?

Also, Ben, thoughts?

Cheers,
Phil

--

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea74d43924b2..297fd228fdb0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}

+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,14 +4894,46 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
+ u64 start, now;

raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ now = start = ktime_to_ns(hrtimer_cb_get_time(timer));
for (;;) {
- overrun = hrtimer_forward_now(timer, cfs_b->period);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
if (!overrun)
break;

+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ /* rough average of the time each loop is taking
+ * really should be (n-s)/3 but this is easier for the machine
+ */
+ new = (now - start) >> 2;
+ if (new < old)
+ new = old;
+ new = (new * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota /= old;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ new/NSEC_PER_USEC,
+ cfs_b->quota/NSEC_PER_USEC);
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+ now = ktime_to_ns(hrtimer_cb_get_time(timer));
}
if (idle)
cfs_b->period_active = 0;



--

2019-03-18 17:15:23

by Benjamin Segall

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

Phil Auld <[email protected]> writes:

> On Fri, Mar 15, 2019 at 05:03:47PM +0100 Peter Zijlstra wrote:
>> On Fri, Mar 15, 2019 at 11:30:42AM -0400, Phil Auld wrote:
>>
>> >> I'll rework the maths in the averaged version and post v2 if that makes sense.
>> >
>> > It may have the extra timer fetch, although maybe I could rework it so that it used the
>> > nsstart time the first time and did not need to do it twice in a row. I had originally
>> > reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.
>>
>> Sure; but remember, simpler is often better, esp. for code that
>> typically 'never' runs.
>
> I reworked it to the below. This settles a bit faster. The average is sort of squishy because
> it's 3 samples divided by 4. And if we stay in a single call after updating the period the "average"
> will be even less accurate.
>
> It settles at a larger value faster so produces fewer messages and none of the callback suppressed ones.
> The added complexity may not be worth it, though.
>
> I think this or your version, either one, would work.
>
> What needs to happen now to get one of them to land somewhere? Should I just repost one with my
> signed-off and let you add whatever other tags? And if so do you have a preference for which one?
>
> Also, Ben, thoughts?

It would probably make sense to have it just be ++count > 4 then I
think? But otherwise yeah, I'm fine with either.
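
(With > 4 the check fires only after four completed do_sched_cfs_period_timer()
calls, so the (now - start) >> 2 divides by the actual number of samples.)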

>
> Cheers,
> Phil
>
> --
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ea74d43924b2..297fd228fdb0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> +extern const u64 max_cfs_quota_period;
> +
> static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> {
> struct cfs_bandwidth *cfs_b =
> @@ -4892,14 +4894,46 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> unsigned long flags;
> int overrun;
> int idle = 0;
> + int count = 0;
> + u64 start, now;
>
> raw_spin_lock_irqsave(&cfs_b->lock, flags);
> + now = start = ktime_to_ns(hrtimer_cb_get_time(timer));
> for (;;) {
> - overrun = hrtimer_forward_now(timer, cfs_b->period);
> + overrun = hrtimer_forward(timer, now, cfs_b->period);
> if (!overrun)
> break;
>
> + if (++count > 3) {
> + u64 new, old = ktime_to_ns(cfs_b->period);
> +
> + /* rough average of the time each loop is taking
> + * really should be (n-s)/3 but this is easier for the machine
> + */
> + new = (now - start) >> 2;
> + if (new < old)
> + new = old;
> + new = (new * 147) / 128; /* ~115% */
> + new = min(new, max_cfs_quota_period);
> +
> + cfs_b->period = ns_to_ktime(new);
> +
> + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> + cfs_b->quota *= new;
> + cfs_b->quota /= old;
> +
> + pr_warn_ratelimited(
> + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> + smp_processor_id(),
> + new/NSEC_PER_USEC,
> + cfs_b->quota/NSEC_PER_USEC);
> +
> + /* reset count so we don't come right back in here */
> + count = 0;
> + }
> +
> idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> + now = ktime_to_ns(hrtimer_cb_get_time(timer));
> }
> if (idle)
> cfs_b->period_active = 0;

2019-03-18 17:55:04

by Phil Auld

Subject: Re: [PATCH] sched/fair: Limit sched_cfs_period_timer loop to avoid hard lockup

On Mon, Mar 18, 2019 at 10:14:22AM -0700 [email protected] wrote:
> Phil Auld <[email protected]> writes:
>
> > On Fri, Mar 15, 2019 at 05:03:47PM +0100 Peter Zijlstra wrote:
> >> On Fri, Mar 15, 2019 at 11:30:42AM -0400, Phil Auld wrote:
> >>
> >> >> I'll rework the maths in the averaged version and post v2 if that makes sense.
> >> >
> >> > It may have the extra timer fetch, although maybe I could rework it so that it used the
> >> > nsstart time the first time and did not need to do it twice in a row. I had originally
> >> > reverted the hrtimer_forward_now() to hrtimer_forward() but put that back.
> >>
> >> Sure; but remember, simpler is often better, esp. for code that
> >> typically 'never' runs.
> >
> > I reworked it to the below. This settles a bit faster. The average is sort of squishy because
> > it's 3 samples divided by 4. And if we stay in a single call after updating the period the "average"
> > will be even less accurate.
> >
> > It settles at a larger value faster so produces fewer messages and none of the callback suppressed ones.
> > The added complexity may not be worth it, though.
> >
> > I think this or your version, either one, would work.
> >
> > What needs to happen now to get one of them to land somewhere? Should I just repost one with my
> > signed-off and let you add whatever other tags? And if so do you have a preference for which one?
> >
> > Also, Ben, thoughts?
>
> It would probably make sense to have it just be ++count > 4 then I
> think? But otherwise yeah, I'm fine with either.
>

That would certainly work, of course :)

I liked the 3 to catch it as soon as possible, but in the end one more loop likely won't matter,
and it may prevent more since we are more likely to have it right.


Cheers,
Phil

> >
> > Cheers,
> > Phil
> >
> > --
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index ea74d43924b2..297fd228fdb0 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4885,6 +4885,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> > return HRTIMER_NORESTART;
> > }
> >
> > +extern const u64 max_cfs_quota_period;
> > +
> > static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > {
> > struct cfs_bandwidth *cfs_b =
> > @@ -4892,14 +4894,46 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> > unsigned long flags;
> > int overrun;
> > int idle = 0;
> > + int count = 0;
> > + u64 start, now;
> >
> > raw_spin_lock_irqsave(&cfs_b->lock, flags);
> > + now = start = ktime_to_ns(hrtimer_cb_get_time(timer));
> > for (;;) {
> > - overrun = hrtimer_forward_now(timer, cfs_b->period);
> > + overrun = hrtimer_forward(timer, now, cfs_b->period);
> > if (!overrun)
> > break;
> >
> > + if (++count > 3) {
> > + u64 new, old = ktime_to_ns(cfs_b->period);
> > +
> > + /* rough average of the time each loop is taking
> > + * really should be (n-s)/3 but this is easier for the machine
> > + */
> > + new = (now - start) >> 2;
> > + if (new < old)
> > + new = old;
> > + new = (new * 147) / 128; /* ~115% */
> > + new = min(new, max_cfs_quota_period);
> > +
> > + cfs_b->period = ns_to_ktime(new);
> > +
> > + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
> > + cfs_b->quota *= new;
> > + cfs_b->quota /= old;
> > +
> > + pr_warn_ratelimited(
> > + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
> > + smp_processor_id(),
> > + new/NSEC_PER_USEC,
> > + cfs_b->quota/NSEC_PER_USEC);
> > +
> > + /* reset count so we don't come right back in here */
> > + count = 0;
> > + }
> > +
> > idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> > + now = ktime_to_ns(hrtimer_cb_get_time(timer));
> > }
> > if (idle)
> > cfs_b->period_active = 0;

--