This patch adds a per-task_group timer which handles the refresh of the global
CFS bandwidth pool.
Since the RT pool uses a similar timer, there is some small refactoring to
share this support.
Signed-off-by: Paul Turner <[email protected]>
---
kernel/sched.c | 87 ++++++++++++++++++++++++++++++++++++++++------------
kernel/sched_fair.c | 9 +++++
2 files changed, 77 insertions(+), 19 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -193,10 +193,28 @@ static inline int rt_bandwidth_enabled(v
return sysctl_sched_rt_runtime >= 0;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
- ktime_t now;
+ unsigned long delta;
+ ktime_t soft, hard, now;
+
+ for (;;) {
+ if (hrtimer_active(period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(period_timer);
+ hrtimer_forward(period_timer, now, period);
+ soft = hrtimer_get_softexpires(period_timer);
+ hard = hrtimer_get_expires(period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(period_timer, soft, delta,
+ HRTIMER_MODE_ABS_PINNED, 0);
+ }
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
@@ -204,22 +222,7 @@ static void start_rt_bandwidth(struct rt
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
- for (;;) {
- unsigned long delta;
- ktime_t soft, hard;
-
- if (hrtimer_active(&rt_b->rt_period_timer))
- break;
-
- now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
- hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
- soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
- hard = hrtimer_get_expires(&rt_b->rt_period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -250,6 +253,9 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota;
s64 hierarchal_quota;
+
+ int idle;
+ struct hrtimer period_timer;
#endif
};
@@ -394,12 +400,38 @@ static inline struct cfs_bandwidth *tg_c
#ifdef CONFIG_CFS_BANDWIDTH
static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -411,8 +443,25 @@ static void init_cfs_rq_runtime(struct c
cfs_rq->runtime_enabled = 1;
}
+static void start_cfs_bandwidth(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ if (cfs_b->quota == RUNTIME_INF)
+ return;
+
+ if (hrtimer_active(&cfs_b->period_timer))
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+ raw_spin_unlock(&cfs_b->lock);
+}
+
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{}
+{
+ hrtimer_cancel(&cfs_b->period_timer);
+}
#else
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1003,6 +1003,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
if (cfs_rq->nr_running == 1)
list_add_leaf_cfs_rq(cfs_rq);
+
+ start_cfs_bandwidth(cfs_rq);
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -1220,6 +1222,8 @@ static void put_prev_entity(struct cfs_r
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
+
+ start_cfs_bandwidth(cfs_rq);
}
cfs_rq->curr = NULL;
}
@@ -1272,6 +1276,11 @@ static inline u64 default_cfs_period(voi
{
return 500000000ULL;
}
+
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+ return 1;
+}
#endif
/**************************************************
(2011/05/03 18:28), Paul Turner wrote:
> @@ -250,6 +253,9 @@ struct cfs_bandwidth {
> ktime_t period;
> u64 quota;
> s64 hierarchal_quota;
> +
> + int idle;
> + struct hrtimer period_timer;
> #endif
> };
>
"idle" is not used yet. How about adding it in a later patch?
Plus, a comment explaining how it is used would be appreciated.
> static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> {
> raw_spin_lock_init(&cfs_b->lock);
> cfs_b->quota = RUNTIME_INF;
> cfs_b->period = ns_to_ktime(default_cfs_period());
> +
> + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + cfs_b->period_timer.function = sched_cfs_period_timer;
> +
> }
Nit: blank line?
Reviewed-by: Hidetoshi Seto <[email protected]>
Thanks,
H.Seto
On Tue, May 10, 2011 at 12:21 AM, Hidetoshi Seto
<[email protected]> wrote:
> (2011/05/03 18:28), Paul Turner wrote:
>> @@ -250,6 +253,9 @@ struct cfs_bandwidth {
>> ktime_t period;
>> u64 quota;
>> s64 hierarchal_quota;
>> +
>> + int idle;
>> + struct hrtimer period_timer;
>> #endif
>> };
>>
>
> "idle" is not used yet. How about adding it in a later patch?
> Plus, a comment explaining how it is used would be appreciated.
Fixed both. (idle belongs to the accumulate patch)
>
>> static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>> {
>> raw_spin_lock_init(&cfs_b->lock);
>> cfs_b->quota = RUNTIME_INF;
>> cfs_b->period = ns_to_ktime(default_cfs_period());
>> +
>> + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> + cfs_b->period_timer.function = sched_cfs_period_timer;
>> +
>> }
>
> Nit: blank line?
>
> Reviewed-by: Hidetoshi Seto <[email protected]>
>
>
> Thanks,
> H.Seto
>
>
On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
> @@ -1003,6 +1003,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>
> if (cfs_rq->nr_running == 1)
> list_add_leaf_cfs_rq(cfs_rq);
> +
> + start_cfs_bandwidth(cfs_rq);
> }
>
> static void __clear_buddies_last(struct sched_entity *se)
> @@ -1220,6 +1222,8 @@ static void put_prev_entity(struct cfs_r
> update_stats_wait_start(cfs_rq, prev);
> /* Put 'current' back into the tree. */
> __enqueue_entity(cfs_rq, prev);
> +
> + start_cfs_bandwidth(cfs_rq);
> }
> cfs_rq->curr = NULL;
> }
OK, so while the first made sense the second had me go wtf?!, now I
_think_ you do that because do_sched_cfs_period_timer() can return idle
and stop the timer when no bandwidth consumption is seen for a while,
and thus we need to re-start the timer when we put the entity to sleep,
since that could have been a throttle.
If that's so then neither really do make sense and a big fat comment is
missing.
So why not start on the same (but inverse) condition that makes it stop?
On Mon, May 16, 2011 at 3:18 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-05-03 at 02:28 -0700, Paul Turner wrote:
>> @@ -1003,6 +1003,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>>
>> if (cfs_rq->nr_running == 1)
>> list_add_leaf_cfs_rq(cfs_rq);
>> +
>> + start_cfs_bandwidth(cfs_rq);
>> }
>>
>> static void __clear_buddies_last(struct sched_entity *se)
>> @@ -1220,6 +1222,8 @@ static void put_prev_entity(struct cfs_r
>> update_stats_wait_start(cfs_rq, prev);
>> /* Put 'current' back into the tree. */
>> __enqueue_entity(cfs_rq, prev);
>> +
>> + start_cfs_bandwidth(cfs_rq);
>> }
>> cfs_rq->curr = NULL;
>> }
>
> OK, so while the first made sense the second had me go wtf?!, now I
> _think_ you do that because do_sched_cfs_period_timer() can return idle
> and stop the timer when no bandwidth consumption is seen for a while,
> and thus we need to re-start the timer when we put the entity to sleep,
> since that could have been a throttle.
>
> If that's so then neither really do make sense and a big fat comment is
> missing.
>
> So why not start on the same (but inverse) condition that makes it stop?
>
This was originally to guard against the case where an entity was running on
stale quota (from a previous period), resulting in cfs_bandwidth->idle being
set and the timer not being re-instantiated.
Now that expiration is properly integrated I think the two cases are
analogous and that this can be dropped (and the start moved into the
(nr_running == 1) entity case on enqueue).
I think this is correct but my brain's a little fuzzy right now, will
confirm in the morning.