2011-06-21 07:21:49

by Paul Turner

Subject: [patch 15/16] sched: return unused runtime on voluntary sleep

When a local cfs_rq blocks we return the majority of its remaining quota to the
global bandwidth pool for use by other runqueues.

We do this only when the quota is current and there is more than
min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.

In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second timer is kicked off to handle this
delivery, unthrottling where appropriate.

Using a 'worst case' antagonist which executes on each cpu for 1ms before
moving on to the next, on a fairly large machine (a rough sketch of such a
workload follows the results below):

no quota generations:
197.47 ms /cgroup/a/cpuacct.usage
199.46 ms /cgroup/a/cpuacct.usage
205.46 ms /cgroup/a/cpuacct.usage
198.46 ms /cgroup/a/cpuacct.usage
208.39 ms /cgroup/a/cpuacct.usage
Since we are allowed to use "stale" quota our usage is effectively bounded by
the rate of input into the global pool and performance is relatively stable.

with quota generations [1s increments]:
119.58 ms /cgroup/a/cpuacct.usage
119.65 ms /cgroup/a/cpuacct.usage
119.64 ms /cgroup/a/cpuacct.usage
119.63 ms /cgroup/a/cpuacct.usage
119.60 ms /cgroup/a/cpuacct.usage
The large deficit here is due to quota generations (/intentionally/) preventing
us from now using previously stranded slack quota. The cost is that this quota
becomes unavailable.

with quota generations and quota return:
200.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
198.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
200.06 ms /cgroup/a/cpuacct.usage
By returning unused quota we're able to both stably consume our desired quota
and prevent unintentional overages due to the abuse of slack quota from
previous quota periods (especially on a large machine).
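
For reference, a rough sketch of the antagonist workload described above
(this is a hypothetical reconstruction; the real test harness is not part
of this posting, and cgroup creation plus quota setup are assumed to be
done separately before running it inside /cgroup/a):

/* hypothetical sketch of the per-cpu antagonist; not the original harness */
#define _GNU_SOURCE
#include <sched.h>
#include <time.h>
#include <unistd.h>

/* burn roughly 'ns' nanoseconds of cpu time */
static void burn_ns(long ns)
{
        struct timespec start, now;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do {
                clock_gettime(CLOCK_MONOTONIC, &now);
        } while ((now.tv_sec - start.tv_sec) * 1000000000L +
                 (now.tv_nsec - start.tv_nsec) < ns);
}

int main(void)
{
        long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
        cpu_set_t mask;
        int cpu;

        for (;;) {
                for (cpu = 0; cpu < nr_cpus; cpu++) {
                        CPU_ZERO(&mask);
                        CPU_SET(cpu, &mask);
                        /* hop to the next cpu, then run ~1ms there */
                        sched_setaffinity(0, sizeof(mask), &mask);
                        burn_ns(1000000L);
                }
        }
        return 0;
}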

Signed-off-by: Paul Turner <[email protected]>

---
kernel/sched.c | 15 +++++++
kernel/sched_fair.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 113 insertions(+), 1 deletion(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -256,7 +256,7 @@ struct cfs_bandwidth {
u64 runtime_expires;

int idle, timer_active;
- struct hrtimer period_timer;
+ struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;

/* statistics */
@@ -417,6 +417,16 @@ static inline struct cfs_bandwidth *tg_c

static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
@@ -449,6 +459,8 @@ static void init_cfs_bandwidth(struct cf
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
}

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -476,6 +488,7 @@ static void __start_cfs_bandwidth(struct
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
}
#else
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1071,6 +1071,8 @@ static void clear_buddies(struct cfs_rq
__clear_buddies_skip(se);
}

+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -1109,6 +1111,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;

+ /* return excess runtime on last deuque */
+ if (!cfs_rq->nr_running)
+ return_cfs_rq_runtime(cfs_rq);
+
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}
@@ -1694,11 +1700,104 @@ out_unlock:

return idle;
}
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
+ u64 remaining;
+
+ /* if the call-back is running a quota refresh is already occurring */
+ if (hrtimer_callback_running(refresh_timer))
+ return 1;
+
+ /* is a quota refresh about to occur? */
+ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+ if (remaining < min_expire)
+ return 1;
+
+ return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+ /* if there's a quota refresh soon don't bother with slack */
+ if (runtime_refresh_within(cfs_b, min_left))
+ return;
+
+ start_bandwidth_timer(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+ if (!cfs_rq->runtime_enabled)
+ return;
+
+ if (slack_runtime <= 0)
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF &&
+ (s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ cfs_b->runtime += slack_runtime;
+
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+ !list_empty(&cfs_b->throttled_cfs_rq))
+ start_cfs_slack_bandwidth(cfs_b);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ u64 expires;
+
+ /* confirm we're still not at a refresh boundary */
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ runtime = cfs_b->runtime;
+ cfs_b->runtime = 0;
+ }
+ expires = cfs_b->runtime_expires;
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!runtime)
+ return;
+
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+ raw_spin_lock(&cfs_b->lock);
+ if (expires == cfs_b->runtime_expires)
+ cfs_b->runtime = runtime;
+ raw_spin_unlock(&cfs_b->lock);
+}
+
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{


2011-06-21 07:33:55

by Paul Turner

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

I just realized the title of this patch is stale; as mentioned in the
changelog, we return quota on every dequeue to avoid stranding bandwidth.

On Tue, Jun 21, 2011 at 12:17 AM, Paul Turner <[email protected]> wrote:

[...]

> +       /* return excess runtime on last deuque */

typo here also fixed

[...]

2011-06-22 09:40:14

by Hidetoshi Seto

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

(2011/06/21 16:17), Paul Turner wrote:
> When a local cfs_rq blocks we return the majority of its remaining quota to the
> global bandwidth pool for use by other runqueues.

[...]

(For all but the patch title:)
Reviewed-by: Hidetoshi Seto <[email protected]>


Thanks,
H.Seto

2011-06-23 15:27:17

by Peter Zijlstra

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

On Tue, 2011-06-21 at 00:17 -0700, Paul Turner wrote:
> plain text document attachment (sched-bwc-simple_return_quota.patch)
> When a local cfs_rq blocks we return the majority of its remaining quota to the
> global bandwidth pool for use by other runqueues.

OK, I saw return_cfs_rq_runtime() do that.

> We do this only when the quota is current and there is more than
> min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.

sure..

> In the case where there are throttled runqueues and we have sufficient
> bandwidth to meter out a slice, a second timer is kicked off to handle this
> delivery, unthrottling where appropriate.

I'm having trouble there: what's the purpose of the timer? You could
redistribute immediately. None of this is well explained.

2011-06-28 01:43:48

by Paul Turner

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

On Thu, Jun 23, 2011 at 8:26 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-06-21 at 00:17 -0700, Paul Turner wrote:
>> plain text document attachment (sched-bwc-simple_return_quota.patch)
>> When a local cfs_rq blocks we return the majority of its remaining quota to the
>> global bandwidth pool for use by other runqueues.
>
> OK, I saw return_cfs_rq_runtime() do that.
>
>> We do this only when the quota is current and there is more than
>> min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.
>
> sure..
>
>> In the case where there are throttled runqueues and we have sufficient
>> bandwidth to meter out a slice, a second timer is kicked off to handle this
>> delivery, unthrottling where appropriate.
>
> I'm having trouble there: what's the purpose of the timer? You could
> redistribute immediately. None of this is well explained.
>

Current reasons:
- There was concern regarding thrashing the unthrottle path on a task
that is rapidly oscillating between runnable states; using a timer,
this operation is inherently limited both in frequency and to a single
cpu. I think the move to using a throttled list (as opposed to having
to poll all cpus), as well as the fact that we only return quota in
excess of min_cfs_rq_quota, probably mitigates this to the point where
we could just do away with the timer and do the return directly in the
put path.

- The aesthetics of releasing rq->lock in the put path. Quick
inspection suggests it should actually be safe to do at that point,
and we do similar for idle_balance().

Given that the above two factors are not requirements, this could be
moved out of a timer and into the put path directly (with the fact that
we drop rq->lock strongly commented). I have no strong preference
between the two choices.

Uninteresting additional historical reason:
The /original/ requirement for a timer here is that previous versions
placed some of the bandwidth distribution under cfs_b->lock. This
meant that we couldn't take rq->lock under cfs_b->lock (as the nesting
is the other way around). This is no longer a requirement
(advancement of expiration now provides what cfs_b->lock used to
here).




A timer is used so that we don't have to release rq->lock within the put path

2011-06-28 10:05:59

by Peter Zijlstra

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

On Mon, 2011-06-27 at 18:42 -0700, Paul Turner wrote:

> - The aesthetics of releasing rq->lock in the put path. Quick
> inspection suggests it should actually be safe to do at that point,
> and we do similar for idle_balance().
>
> Given that the above two factors are not requirements, this could be
> moved out of a timer and into the put path directly (with the fact that
> we drop rq->lock strongly commented). I have no strong preference
> between the two choices.

Argh, ok I see, distribute_cfs_runtime() wants that. Dropping rq->lock
is very fragile, especially from the put path; you can only do that
_before_ the put path updates rq->curr etc. So I'd rather you didn't,
just keep the timer crap and add some comments there.

And we need that distribute_cfs_runtime() muck because that's what
unthrottles rqs when more runtime is available. Bah.

2011-06-28 18:46:30

by Paul Turner

Subject: Re: [patch 15/16] sched: return unused runtime on voluntary sleep

On Tue, Jun 28, 2011 at 3:01 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2011-06-27 at 18:42 -0700, Paul Turner wrote:
>
>> - The aesthetics of releasing rq->lock in the put path.  Quick
>> inspection suggests it should actually be safe to do at that point,
>> and we do similar for idle_balance().
>>
>> Given that the above two factors are not requirements, this could be
>> moved out of a timer and into the put path directly (with the fact that
>> we drop rq->lock strongly commented). I have no strong preference
>> between the two choices.
>
> Argh, ok I see, distribute_cfs_runtime() wants that. Dropping rq->lock
> is very fragile, especially from the put path; you can only do that
> _before_ the put path updates rq->curr etc. So I'd rather you didn't,
> just keep the timer crap and add some comments there.
>

Done.

An alternative that does come to mind is something like:

- cfs_b->lock may be sufficient synchronization to twiddle
cfs_rq->runtime_assigned (once it has been throttled, modulo alb)
- we could use this to synchronize a per-rq "to-be-unthrottled" list
which could be checked under something like task_tick_fair

We'd have to be careful about making sure we wake up a sleeping cpu,
but this hides the rq->lock requirements and would let us kill the
timer (roughly sketched below).

Perhaps this could be a future incremental improvement.
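
A very rough sketch of that alternative, purely hypothetical (the
deferred_unthrottle list, deferred_lock and deferred_list field, and both
helpers below are invented for illustration and do not exist in this
series; the locking is only indicative):

/*
 * Hypothetical sketch: rq->deferred_unthrottle, rq->deferred_lock and
 * cfs_rq->deferred_list do not exist; they are invented for illustration.
 */

/* called from the slack-return path for a throttled, refilled cfs_rq */
static void queue_deferred_unthrottle(struct rq *rq, struct cfs_rq *cfs_rq)
{
        raw_spin_lock(&rq->deferred_lock);
        if (list_empty(&cfs_rq->deferred_list))
                list_add_tail(&cfs_rq->deferred_list,
                              &rq->deferred_unthrottle);
        raw_spin_unlock(&rq->deferred_lock);
}

/* drained from task_tick_fair() on the owning cpu, with rq->lock held */
static void drain_deferred_unthrottle(struct rq *rq)
{
        struct cfs_rq *cfs_rq, *tmp;
        LIST_HEAD(todo);

        raw_spin_lock(&rq->deferred_lock);
        list_splice_init(&rq->deferred_unthrottle, &todo);
        raw_spin_unlock(&rq->deferred_lock);

        list_for_each_entry_safe(cfs_rq, tmp, &todo, deferred_list) {
                list_del_init(&cfs_rq->deferred_list);
                if (cfs_rq_throttled(cfs_rq) && cfs_rq->runtime_remaining > 0)
                        unthrottle_cfs_rq(cfs_rq);
        }
}

The open question noted above (waking a sleeping cpu so the tick actually
runs and drains the list) is not addressed by this sketch.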
