2020-04-10 22:54:58

by Josh Don

Subject: [PATCH 1/2] sched: eliminate bandwidth race between throttling and distribution

From: Paul Turner <[email protected]>

There is a race window in which an entity begins throttling before quota
is added to the pool, but does not finish throttling until after we have
finished with distribute_cfs_runtime(). This entity is not observed by
distribute_cfs_runtime() because it was not on the throttled list at the
time that distribution was running. This race manifests as rare
period-length stalls for such entities.

Rather than heavy-weight synchronization with the progress of
distribution, we can fix this by aborting throttling if bandwidth has
become available. Otherwise, we immediately add the entity to the
throttled list so that it can be observed by a subsequent distribution.

Additionally, we can remove the case of adding the throttled entity to
the head of the throttled list, and simply always add to the tail.
Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto
its own pool of runtime. This means that if we do hit the !assign and
distribute_running case, we know that distribution is about to end.

Signed-off-by: Paul Turner <[email protected]>
Co-developed-by: Ben Segall <[email protected]>
Signed-off-by: Ben Segall <[email protected]>
Signed-off-by: Josh Don <[email protected]>
---
kernel/sched/fair.c | 78 ++++++++++++++++++++++++++-------------------
1 file changed, 46 insertions(+), 32 deletions(-)
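
As a simplified sketch of the resulting control flow (illustration only:
the real throttle_cfs_rq() below also walks the se hierarchy, updates
h_nr_running/idle_h_nr_running and dequeues the entities before marking
the cfs_rq throttled):

static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	bool throttle;

	raw_spin_lock(&cfs_b->lock);
	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
		/* Raced with bandwidth becoming available: bail out. */
		throttle = false;
	} else {
		/* Publish ourselves before any later distribution runs. */
		list_add_tail_rcu(&cfs_rq->throttled_list,
				  &cfs_b->throttled_cfs_rq);
		throttle = true;
	}
	raw_spin_unlock(&cfs_b->lock);

	if (!throttle)
		return false;

	/* ... dequeue the hierarchy under rq->lock ... */

	cfs_rq->throttled = 1;
	cfs_rq->throttled_clock = rq_clock(rq_of(cfs_rq));
	return true;
}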

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b85b6d..3fb986c52825 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4587,17 +4587,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}

-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime, called with cfs_b->lock held */
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount;

/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;

- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4609,13 +4607,26 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);

cfs_rq->runtime_remaining += amount;

return cfs_rq->runtime_remaining > 0;
}

+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ int ret;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq,
+ sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4715,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}

-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

@@ -4744,29 +4775,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);

- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
/*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
- /*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
- */
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -5121,8 +5136,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;

- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
--
2.26.0.110.g2183baf09c-goog


2020-04-13 15:35:16

by Phil Auld

Subject: Re: [PATCH 1/2] sched: eliminate bandwidth race between throttling and distribution

Hi Josh,

On Sat, Apr 11, 2020 at 07:00:17PM -0700 Josh Don wrote:
> +[1][email protected]
>
> On Fri, Apr 10, 2020 at 3:52 PM Josh Don <[2][email protected]> wrote:
>
> From: Paul Turner <[3][email protected]>
>
> There is a race window in which an entity begins throttling before quota
> is added to the pool, but does not finish throttling until after we have
> finished with distribute_cfs_runtime(). This entity is not observed by
> distribute_cfs_runtime() because it was not on the throttled list at the
> time that distribution was running. This race manifests as rare
> period-length statlls for such entities.
>
> Rather than heavy-weight the synchronization with the progress of
> distribution, we can fix this by aborting throttling if bandwidth has
> become available. Otherwise, we immediately add the entity to the
> throttled list so that it can be observed by a subsequent distribution.
>
> Additionally, we can remove the case of adding the throttled entity to
> the head of the throttled list, and simply always add to the tail.
> Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto
> its own pool of runtime. This means that if we do hit the !assign and
> distribute_running case, we know that distribution is about to end.
>
> Signed-off-by: Paul Turner <[4][email protected]>
> Co-developed-by: Ben Segall <[5][email protected]>
> Signed-off-by: Ben Segall <[6][email protected]>
> Signed-off-by: Josh Don <[7][email protected]>

This looks good to me, thanks for the cc.

Reviewed-by: Phil Auld <[email protected]>

--

2020-04-15 19:17:40

by Peter Zijlstra

Subject: Re: [PATCH 1/2] sched: eliminate bandwidth race between throttling and distribution

On Fri, Apr 10, 2020 at 03:52:07PM -0700, Josh Don wrote:

> -/* returns 0 on failure to allocate runtime */
> +/* returns 0 on failure to allocate runtime, called with cfs_b->lock held */

That's a gross mis-spelling of lockdep_assert_held(); and since I was
editing things anyway it now looks like so:

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4587,11 +4587,13 @@ static inline struct cfs_bandwidth *tg_c
return &tg->cfs_bandwidth;
}

-/* returns 0 on failure to allocate runtime, called with cfs_b->lock held */
+/* returns 0 on failure to allocate runtime */
static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
struct cfs_rq *cfs_rq, u64 target_runtime)
{
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);

/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = target_runtime - cfs_rq->runtime_remaining;
@@ -4616,12 +4618,11 @@ static int __assign_cfs_rq_runtime(struc
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- int ret;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;

raw_spin_lock(&cfs_b->lock);
- ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq,
- sched_cfs_bandwidth_slice());
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
raw_spin_unlock(&cfs_b->lock);

return ret;
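
For completeness, the idiom in a minimal, self-contained sketch (the helper
and its arguments are made up for illustration, not taken from fair.c):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>

/*
 * Caller must hold @lock. lockdep_assert_held() makes that requirement
 * machine-checkable under CONFIG_PROVE_LOCKING, rather than documenting
 * it in a comment that can go stale.
 */
static void bump_counter_locked(raw_spinlock_t *lock, u64 *counter)
{
	lockdep_assert_held(lock);
	(*counter)++;
}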

Subject: [tip: sched/core] sched/fair: Eliminate bandwidth race between throttling and distribution

The following commit has been merged into the sched/core branch of tip:

Commit-ID: e98fa02c4f2ea4991dae422ac7e34d102d2f0599
Gitweb: https://git.kernel.org/tip/e98fa02c4f2ea4991dae422ac7e34d102d2f0599
Author: Paul Turner <[email protected]>
AuthorDate: Fri, 10 Apr 2020 15:52:07 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 30 Apr 2020 20:14:38 +02:00

sched/fair: Eliminate bandwidth race between throttling and distribution

There is a race window in which an entity begins throttling before quota
is added to the pool, but does not finish throttling until after we have
finished with distribute_cfs_runtime(). This entity is not observed by
distribute_cfs_runtime() because it was not on the throttled list at the
time that distribution was running. This race manifests as rare
period-length stalls for such entities.

Rather than heavy-weight synchronization with the progress of
distribution, we can fix this by aborting throttling if bandwidth has
become available. Otherwise, we immediately add the entity to the
throttled list so that it can be observed by a subsequent distribution.

Additionally, we can remove the case of adding the throttled entity to
the head of the throttled list, and simply always add to the tail.
Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto
its own pool of runtime. This means that if we do hit the !assign and
distribute_running case, we know that distribution is about to end.

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Ben Segall <[email protected]>
Signed-off-by: Josh Don <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Phil Auld <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 79 ++++++++++++++++++++++++++------------------
1 file changed, 47 insertions(+), 32 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b..0c13a41 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4588,16 +4588,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}

/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);

/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;

- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4609,13 +4609,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);

cfs_rq->runtime_remaining += amount;

return cfs_rq->runtime_remaining > 0;
}

+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
@@ -4704,13 +4716,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}

-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

@@ -4744,29 +4776,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);

- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
- /*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
- * not running add to the tail so that later runqueues don't get starved.
- */
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
/*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -5121,8 +5137,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;

- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)