2015-04-15 09:55:27

by Peter Zijlstra

Subject: [PATCH 2/3] sched: Cleanup bandwidth timers

Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().

The more I look at that code the more I'm convinced it's crack; that
entire __start_cfs_bandwidth() thing is brain melting. We don't need to
cancel a timer before starting it; *hrtimer_start*() will happily remove
the timer for you if it's still enqueued.

Removing that removes a big part of the problem: no more ugly cancel
loop to get stuck in.

So now, if I understand things right, the entire reason you have this
cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
accidentally lose the timer.

It appears to me that it should be possible to guarantee the same by
unconditionally (re)starting the timer when !queued. Because regardless
of what hrtimer::function returns, if we beat it to (re)enqueueing the
timer, it doesn't matter.

Now, because hrtimers don't come with any serialization guarantees we
must ensure both handler and (re)start loop serialize their access to
the hrtimer to avoid both trying to forward the timer at the same
time.

Update the rt bandwidth timer to match.

This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
tg_set_cfs_bandwidth() deadlock on rq->lock").

Cc: Thomas Gleixner <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Ben Segall <[email protected]>
Reported-by: Roman Gushchin <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 15 ++++++------
kernel/sched/fair.c | 59 ++++++++++++---------------------------------------
kernel/sched/rt.c | 14 +++++-------
kernel/sched/sched.h | 4 +--
4 files changed, 31 insertions(+), 61 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -92,10 +92,13 @@

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
- if (hrtimer_active(period_timer))
- return;
+ /*
+ * Do not forward the expiration time of active timers;
+ * we do not want to loose an overrun.
+ */
+ if (!hrtimer_active(period_timer))
+ hrtimer_forward_now(period_timer, period);

- hrtimer_forward_now(period_timer, period);
hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
}

@@ -8098,10 +8101,8 @@ static int tg_set_cfs_bandwidth(struct t

__refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
- }
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);

for_each_online_cpu(i) {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /*
- * If the bandwidth pool has become inactive, then at least one
- * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes
- * active.
- */
- if (!cfs_b->timer_active) {
- __refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b, false);
- }
+ start_cfs_bandwidth(cfs_b);

if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_r
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
+ bool empty;

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

@@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_r
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
+ empty = list_empty(&cfs_rq->throttled_list);
+
/*
* Add to the _head_ of the list, so that an already-started
* distribute_cfs_runtime will not see us
*/
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b, false);
+
+ /*
+ * If we're the first throttled task, make sure the bandwidth
+ * timer is running.
+ */
+ if (empty)
+ start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock(&cfs_b->lock);
}

@@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(str
if (cfs_b->idle && !throttled)
goto out_deactivate;

- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
__refill_cfs_bandwidth_runtime(cfs_b);

if (!throttled) {
@@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(str
return 0;

out_deactivate:
- cfs_b->timer_active = 0;
return 1;
}

@@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_sl
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
+
do_sched_cfs_slack_timer(cfs_b);

return HRTIMER_NORESTART;
@@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_pe
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
int overrun;
int idle = 0;

raw_spin_lock(&cfs_b->lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;

@@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct c
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}

-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
- raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (!force && cfs_b->timer_active)
- return;
- }
-
- cfs_b->timer_active = 1;
start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_per
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
int idle = 0;
+ int overrun;

+ raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
if (!overrun)
break;

+ raw_spin_unlock(&rt_b->rt_runtime_lock);
idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ raw_spin_unlock(&rt_b->rt_runtime_lock);

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;

- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
raw_spin_unlock(&rt_b->rt_runtime_lock);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -215,7 +215,7 @@ struct cfs_bandwidth {
s64 hierarchical_quota;
u64 runtime_expires;

- int idle, timer_active;
+ int idle;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;

@@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cf
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);

extern void free_rt_sched_group(struct task_group *tg);


2015-04-16 20:03:46

by Benjamin Segall

Subject: Re: [PATCH 2/3] sched: Cleanup bandwidth timers

Peter Zijlstra <[email protected]> writes:

> Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().
>
> The more I look at that code the more I'm convinced it's crack; that
> entire __start_cfs_bandwidth() thing is brain melting. We don't need to
> cancel a timer before starting it; *hrtimer_start*() will happily remove
> the timer for you if it's still enqueued.
>
> Removing that removes a big part of the problem: no more ugly cancel
> loop to get stuck in.
>
> So now, if I understand things right, the entire reason you have this
> cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
> accidentally lose the timer.
>
> It appears to me that it should be possible to guarantee the same by
> unconditionally (re)starting the timer when !queued. Because regardless
> of what hrtimer::function returns, if we beat it to (re)enqueueing the
> timer, it doesn't matter.
>
> Now, because hrtimers don't come with any serialization guarantees we
> must ensure both handler and (re)start loop serialize their access to
> the hrtimer to avoid both trying to forward the timer at the same
> time.
>
> Update the rt bandwidth timer to match.
>
> This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
> tg_set_cfs_bandwidth() deadlock on rq->lock").
>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Paul Turner <[email protected]>
> Cc: Ben Segall <[email protected]>
> Reported-by: Roman Gushchin <[email protected]>

Reviewed-by: Ben Segall <[email protected]>

So much nicer. Docs/comment issue only: s/loose/lose/



Subject: [tip:timers/core] sched: Cleanup bandwidth timers

Commit-ID: 77a4d1a1b9a122ca1fa3507bd30aec1520d7a8a4
Gitweb: http://git.kernel.org/tip/77a4d1a1b9a122ca1fa3507bd30aec1520d7a8a4
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 15 Apr 2015 11:41:57 +0200
Committer: Thomas Gleixner <[email protected]>
CommitDate: Wed, 22 Apr 2015 17:06:53 +0200

sched: Cleanup bandwidth timers

Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().

The more I look at that code the more I'm convinced it's crack; that
entire __start_cfs_bandwidth() thing is brain melting. We don't need to
cancel a timer before starting it; *hrtimer_start*() will happily remove
the timer for you if it's still enqueued.

Removing that removes a big part of the problem: no more ugly cancel
loop to get stuck in.

So now, if I understand things right, the entire reason you have this
cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
accidentally lose the timer.

It appears to me that it should be possible to guarantee the same by
unconditionally (re)starting the timer when !queued. Because regardless
of what hrtimer::function returns, if we beat it to (re)enqueueing the
timer, it doesn't matter.

Now, because hrtimers don't come with any serialization guarantees we
must ensure both handler and (re)start loop serialize their access to
the hrtimer to avoid both trying to forward the timer at the same
time.

Update the rt bandwidth timer to match.

This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
tg_set_cfs_bandwidth() deadlock on rq->lock").

Reported-by: Roman Gushchin <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Cc: Paul Turner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/sched/core.c | 15 ++++++-------
kernel/sched/fair.c | 59 +++++++++++++---------------------------------------
kernel/sched/rt.c | 14 ++++++-------
kernel/sched/sched.h | 4 ++--
4 files changed, 31 insertions(+), 61 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3026678..d8a6196 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -92,10 +92,13 @@

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
- if (hrtimer_active(period_timer))
- return;
+ /*
+ * Do not forward the expiration time of active timers;
+ * we do not want to loose an overrun.
+ */
+ if (!hrtimer_active(period_timer))
+ hrtimer_forward_now(period_timer, period);

- hrtimer_forward_now(period_timer, period);
hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
}

@@ -8113,10 +8116,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)

__refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
- }
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);

for_each_online_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 854881b..e3b32eb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /*
- * If the bandwidth pool has become inactive, then at least one
- * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes
- * active.
- */
- if (!cfs_b->timer_active) {
- __refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b, false);
- }
+ start_cfs_bandwidth(cfs_b);

if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
+ bool empty;

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

@@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
+ empty = list_empty(&cfs_rq->throttled_list);
+
/*
* Add to the _head_ of the list, so that an already-started
* distribute_cfs_runtime will not see us
*/
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b, false);
+
+ /*
+ * If we're the first throttled task, make sure the bandwidth
+ * timer is running.
+ */
+ if (empty)
+ start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock(&cfs_b->lock);
}

@@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (cfs_b->idle && !throttled)
goto out_deactivate;

- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
__refill_cfs_bandwidth_runtime(cfs_b);

if (!throttled) {
@@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
return 0;

out_deactivate:
- cfs_b->timer_active = 0;
return 1;
}

@@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
+
do_sched_cfs_slack_timer(cfs_b);

return HRTIMER_NORESTART;
@@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
int overrun;
int idle = 0;

raw_spin_lock(&cfs_b->lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;

@@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}

-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
- raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (!force && cfs_b->timer_active)
- return;
- }
-
- cfs_b->timer_active = 1;
start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76..b0febf2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
int idle = 0;
+ int overrun;

+ raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
if (!overrun)
break;

+ raw_spin_unlock(&rt_b->rt_runtime_lock);
idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ raw_spin_unlock(&rt_b->rt_runtime_lock);

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;

- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
raw_spin_unlock(&rt_b->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..08606a1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -215,7 +215,7 @@ struct cfs_bandwidth {
s64 hierarchical_quota;
u64 runtime_expires;

- int idle, timer_active;
+ int idle;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;

@@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);

extern void free_rt_sched_group(struct task_group *tg);