Maintain a list of throttled runqueues instead of having to test every
runqueue to determine whether we need to unthrottle.
Signed-off-by: Paul Turner <[email protected]>
---
kernel/sched.c | 7 +++++++
kernel/sched_fair.c | 25 +++++++++++++------------
2 files changed, 20 insertions(+), 12 deletions(-)
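
For orientation before the diff itself: the patch links each throttled cfs_rq
onto a list owned by its task group's cfs_bandwidth, so the unthrottle path
only walks runqueues that are actually throttled. A minimal sketch of just the
fields involved (taken from the hunks below; everything else in the real
structures is omitted):

/*
 * Sketch only: the fields this patch relies on, with all other members
 * of the real structures elided.
 */
#include <linux/list.h>
#include <linux/spinlock.h>

struct cfs_bandwidth {
	raw_spinlock_t lock;			/* protects throttled_cfs_rq */
	struct list_head throttled_cfs_rq;	/* all currently throttled cfs_rqs */
	/* ... quota, period timer, statistics ... */
};

struct cfs_rq {
	int throttled;				/* set while on the list */
	struct list_head throttled_list;	/* node on throttled_cfs_rq */
	/* ... */
};
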
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -256,6 +256,7 @@ struct cfs_bandwidth {
 	s64 hierarchal_quota; /* used for validating consistency */
 	struct hrtimer period_timer;
+	struct list_head throttled_cfs_rq;
 	/* throttle statistics */
 	u64 nr_periods;
 	u64 nr_throttled;
@@ -394,6 +395,8 @@ struct cfs_rq {
 	int quota_enabled, throttled, throttle_count;
 	s64 quota_remaining;
 	u64 throttled_timestamp;
+
+	struct list_head throttled_list;
 #endif
 #endif
 };
@@ -433,6 +436,7 @@ void init_cfs_bandwidth(struct cfs_bandw
 	raw_spin_lock_init(&cfs_b->lock);
 	cfs_b->quota = cfs_b->runtime = quota;
 	cfs_b->period = ns_to_ktime(period);
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
@@ -8141,6 +8145,9 @@ static void init_cfs_rq(struct cfs_rq *c
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_CFS_BANDWIDTH
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#endif
 #ifdef CONFIG_SMP
 	cfs_rq->load_stamp = 1;
 #endif
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1405,6 +1405,7 @@ static int tg_throttle_down(struct task_
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
@@ -1434,7 +1435,11 @@ static void throttle_cfs_rq(struct cfs_r
 	if (!se)
 		rq->nr_running += task_delta;
 
+	raw_spin_lock(&cfs_b->lock);
 	cfs_rq->throttled = 1;
+	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	raw_spin_unlock(&cfs_b->lock);
+
 	cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
 }
@@ -1450,6 +1455,10 @@ static void unthrottle_cfs_rq(struct cfs
 	update_rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	cfs_b->throttled_time += (rq->clock - cfs_rq->throttled_timestamp);
+
+	cfs_rq->throttled = 0;
+	list_del_rcu(&cfs_rq->throttled_list);
+
 	raw_spin_unlock(&cfs_b->lock);
 	cfs_rq->throttled_timestamp = 0;
@@ -1459,7 +1468,6 @@ static void unthrottle_cfs_rq(struct cfs
 	walk_tg_tree_from(cfs_rq->tg, tg_unthrottle_down, tg_nop,
 			(void*)&udd);
-	cfs_rq->throttled = 0;
 	if (!cfs_rq->load.weight)
 		return;
@@ -1484,22 +1492,15 @@ static void unthrottle_cfs_rq(struct cfs
 	resched_task(rq->curr);
 }
 
-static inline struct task_group *cfs_bandwidth_tg(struct cfs_bandwidth *cfs_b)
-{
-	return container_of(cfs_b, struct task_group, cfs_bandwidth);
-}
-
 static u64 distribute_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 runtime)
 {
-	int i;
+	struct cfs_rq *cfs_rq;
 	u64 quota, remaining = runtime;
-	const struct cpumask *span;
 
 	rcu_read_lock();
-	span = sched_bw_period_mask();
-	for_each_cpu(i, span) {
-		struct rq *rq = cpu_rq(i);
-		struct cfs_rq *cfs_rq = cfs_bandwidth_tg(cfs_b)->cfs_rq[i];
+	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+				throttled_list) {
+		struct rq *rq = rq_of(cfs_rq);
 
 		raw_spin_lock(&rq->lock);
 		if (within_bandwidth(cfs_rq))
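
(The distribute_cfs_bandwidth() hunk is cut off at this point of the message.)
To make the intent of the new loop concrete, here is a rough sketch of the
shape a list-walk distribution pass takes; everything past the
within_bandwidth() check, in particular the quota top-up arithmetic, is an
assumption for illustration rather than the author's exact code:

/*
 * Illustrative sketch only: walk just the throttled cfs_rqs instead of
 * every online runqueue.  The loop header mirrors the hunk above; the
 * body past within_bandwidth() is assumed.
 */
static u64 distribute_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 runtime)
{
	struct cfs_rq *cfs_rq;
	u64 remaining = runtime;
	s64 want;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock(&rq->lock);
		if (within_bandwidth(cfs_rq))
			goto next;	/* already has quota again; nothing to do */

		/* hand out just enough quota to make this cfs_rq runnable */
		want = -cfs_rq->quota_remaining + 1;
		if (want > (s64)remaining)
			want = remaining;
		remaining -= want;
		cfs_rq->quota_remaining += want;

		if (cfs_rq->quota_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);
next:
		raw_spin_unlock(&rq->lock);

		if (!remaining)
			break;
	}
	rcu_read_unlock();

	return remaining;
}

Because unthrottle_cfs_rq() removes the entry under cfs_b->lock with
list_del_rcu(), iterating with list_for_each_entry_rcu() under
rcu_read_lock() stays safe even as entries are deleted mid-walk.
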
Hi Paul,
(2011/03/23 12:03), Paul Turner wrote:
> @@ -1434,7 +1435,11 @@ static void throttle_cfs_rq(struct cfs_r
>  	if (!se)
>  		rq->nr_running += task_delta;
>
> +	raw_spin_lock(&cfs_b->lock);
>  	cfs_rq->throttled = 1;
> +	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> +	raw_spin_unlock(&cfs_b->lock);
> +
>  	cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
>  }
>
Though I'm looking forward to seeing your updates soon, just FYI:
I found that somehow throttle_cfs_rq() was called with an already
throttled cfs_rq.

This corrupts the throttled_cfs_rq list through repeated
list_add_tail_rcu() calls and results in lockups like those Bharata
and Xiao already reported.

There should be a better fix, but the following first aid in
put_prev_entity() worked for me:
-	if (!within_bandwidth(cfs_rq))
+	if (!within_bandwidth(cfs_rq) && !cfs_rq->throttled)
 		throttle_cfs_rq(cfs_rq);
 	else
 		return_cfs_rq_quota(cfs_rq);
I believe you can do better next time.
Thanks,
H.Seto
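
One shape a more robust fix could take, sketched here purely for illustration
(an assumption, not something posted in this series), is to make
throttle_cfs_rq() itself refuse to throttle twice, so that no caller can add
the same cfs_rq to the list a second time:

/*
 * Sketch only: a defensive variant of throttle_cfs_rq() that bails out
 * when the cfs_rq is already throttled, instead of relying on every
 * caller (such as put_prev_entity()) to check first.
 */
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

	if (cfs_rq->throttled)	/* already on cfs_b->throttled_cfs_rq */
		return;

	/* ... dequeue the hierarchy exactly as in the patch above ... */

	raw_spin_lock(&cfs_b->lock);
	cfs_rq->throttled = 1;
	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
	raw_spin_unlock(&cfs_b->lock);

	cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
}

Either way, the invariant to preserve is that a cfs_rq appears on
throttled_cfs_rq at most once while cfs_rq->throttled is set.
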
On Thu, Apr 21, 2011 at 7:50 PM, Hidetoshi Seto
<[email protected]> wrote:
> Hi Paul,
>
> (2011/03/23 12:03), Paul Turner wrote:
>> @@ -1434,7 +1435,11 @@ static void throttle_cfs_rq(struct cfs_r
>>  	if (!se)
>>  		rq->nr_running += task_delta;
>>
>> +	raw_spin_lock(&cfs_b->lock);
>>  	cfs_rq->throttled = 1;
>> +	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
>> +	raw_spin_unlock(&cfs_b->lock);
>> +
>>  	cfs_rq->throttled_timestamp = rq_of(cfs_rq)->clock;
>>  }
>>
>
> Though I'm looking forward to seeing your updates soon, just FYI:
> I found that somehow throttle_cfs_rq() was called with an already
> throttled cfs_rq.
>
> This corrupts the throttled_cfs_rq list through repeated
> list_add_tail_rcu() calls and results in lockups like those Bharata
> and Xiao already reported.
>
> There should be a better fix, but the following first aid in
> put_prev_entity() worked for me:
>
> -	if (!within_bandwidth(cfs_rq))
> +	if (!within_bandwidth(cfs_rq) && !cfs_rq->throttled)
>  		throttle_cfs_rq(cfs_rq);
>  	else
>  		return_cfs_rq_quota(cfs_rq);
>
> I believe you can do better next time.
>
Aha!
Good catch -- Thanks Hidetoshi! I've been looking for that one.
I'm figuring out exactly how this came about and I will repost.
Thanks!
>
> Thanks,
> H.Seto
>
>