2011-02-16 03:20:08

by Paul Turner

Subject: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

In account_cfs_rq_quota() (via update_curr()) we track consumption versus a
cfs_rq's local quota and whether there is global quota available to continue
enabling it in the event we run out.

This patch adds the required support for the latter case, throttling entities
until quota is available to run. Throttling dequeues the entity in question
and sends a reschedule to the owning cpu so that it can be evicted.

The following restrictions apply to a throttled cfs_rq:
- It is dequeued from sched_entity hierarchy and restricted from being
re-enqueued. This means that new/waking children of this entity will be
queued up to it, but not past it.
- It does not contribute to weight calculations in tg_shares_up
- In the case that the cfs_rq of the cpu we are trying to pull from is throttled
it is ignored by the loadbalancer in __load_balance_fair() and
move_one_task_fair().

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Nikhil Rao <[email protected]>
Signed-off-by: Bharata B Rao <[email protected]>
---
kernel/sched.c | 3 +
kernel/sched_fair.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 114 insertions(+), 10 deletions(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -388,6 +388,7 @@ struct cfs_rq {
#endif
#ifdef CONFIG_CFS_BANDWIDTH
u64 quota_assigned, quota_used;
+ int throttled;
#endif
#endif
};
@@ -1656,6 +1657,8 @@ static void update_h_load(long cpu)

static void double_rq_lock(struct rq *rq1, struct rq *rq2);

+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
* way at the expense of forcing extra atomic operations in all
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -331,8 +331,34 @@ static inline struct cfs_bandwidth *tg_c
return &tg->cfs_bandwidth;
}

+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->throttled;
+}
+
+/* it's possible to be 'on_rq' in a dequeued (e.g. throttled) hierarchy */
+static inline int entity_on_rq(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ if (!se->on_rq)
+ return 0;
+
+ return 1;
+}
+
static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
unsigned long delta_exec);
+#else
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int entity_on_rq(struct sched_entity *se)
+{
+ return se->on_rq;
+}
+
#endif


@@ -744,9 +770,10 @@ static void update_cfs_rq_load_contribut
int global_update)
{
struct task_group *tg = cfs_rq->tg;
- long load_avg;
+ long load_avg = 0;

- load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ if (!cfs_rq_throttled(cfs_rq))
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
load_avg -= cfs_rq->load_contribution;

if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
@@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
u64 now, delta;
unsigned long load = cfs_rq->load.weight;

- if (cfs_rq->tg == &root_task_group)
+ /*
+ * Don't maintain averages for the root task group, or while we are
+ * throttled.
+ */
+ if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
return;

now = rq_of(cfs_rq)->clock_task;
@@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
+ !group_cfs_rq(se)->nr_running))
+ return;
+#endif
+
update_cfs_load(cfs_rq, 0);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
*/
update_curr(cfs_rq);

+#ifdef CONFIG_CFS_BANDWIDTH
+ if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
+ return;
+#endif
+
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
@@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
+ /* don't continue to enqueue if our parent is throttled */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
flags = ENQUEUE_WAKEUP;
}

@@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);

- /* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight)
+ /*
+ * Don't dequeue parent if it has other entities besides us,
+ * or if it is throttled
+ */
+ if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
break;
flags |= DEQUEUE_SLEEP;
}
@@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
return delta;
}

+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ /* account load preceeding throttle */
+ update_cfs_load(cfs_rq, 0);
+
+ /* prevent previous buddy nominations from re-picking this se */
+ clear_buddies(cfs_rq_of(se), se);
+
+ /*
+ * It's possible for the current task to block and re-wake before task
+ * switch, leading to a throttle within enqueue_task->update_curr()
+ * versus an an entity that has not technically been enqueued yet.
+ *
+ * In this case, since we haven't actually done the enqueue yet, cut
+ * out and allow enqueue_entity() to short-circuit
+ */
+ if (!se->on_rq)
+ goto out_throttled;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ dequeue_entity(cfs_rq, se, 1);
+ if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+out_throttled:
+ cfs_rq->throttled = 1;
+ update_cfs_rq_load_contribution(cfs_rq, 1);
+}
+
static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
@@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct

cfs_rq->quota_used += delta_exec;

- if (cfs_rq->quota_used < cfs_rq->quota_assigned)
+ if (cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->quota_used < cfs_rq->quota_assigned)
return;

cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
+
+ if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
+ throttle_cfs_rq(cfs_rq);
+ resched_task(cfs_rq->rq->curr);
+ }
}

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
@@ -1941,6 +2033,12 @@ static void check_preempt_wakeup(struct
if (unlikely(se == pse))
return;

+#ifdef CONFIG_CFS_BANDWIDTH
+ /* avoid pre-emption check/buddy nomination for throttled tasks */
+ if (!entity_on_rq(pse))
+ return;
+#endif
+
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
set_next_buddy(pse);

@@ -2060,7 +2158,8 @@ static bool yield_to_task_fair(struct rq
{
struct sched_entity *se = &p->se;

- if (!se->on_rq)
+ /* ensure entire hierarchy is on rq (e.g. running & not throttled) */
+ if (!entity_on_rq(se))
return false;

/* Tell the scheduler that we'd really like pse to run next. */
@@ -2280,7 +2379,8 @@ static void update_shares(int cpu)

rcu_read_lock();
for_each_leaf_cfs_rq(rq, cfs_rq)
- update_shares_cpu(cfs_rq->tg, cpu);
+ if (!cfs_rq_throttled(cfs_rq))
+ update_shares_cpu(cfs_rq->tg, cpu);
rcu_read_unlock();
}

@@ -2304,9 +2404,10 @@ load_balance_fair(struct rq *this_rq, in
u64 rem_load, moved_load;

/*
- * empty group
+ * empty group or throttled cfs_rq
*/
- if (!busiest_cfs_rq->task_weight)
+ if (!busiest_cfs_rq->task_weight ||
+ cfs_rq_throttled(busiest_cfs_rq))
continue;

rem_load = (u64)rem_load_move * busiest_weight;


2011-02-18 06:52:34

by Balbir Singh

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

* Paul Turner <[email protected]> [2011-02-15 19:18:34]:

> In account_cfs_rq_quota() (via update_curr()) we track consumption versus a
> cfs_rq's local quota and whether there is global quota available to continue
> enabling it in the event we run out.
>
> This patch adds the required support for the latter case, throttling entities
> until quota is available to run. Throttling dequeues the entity in question
> and sends a reschedule to the owning cpu so that it can be evicted.
>
> The following restrictions apply to a throttled cfs_rq:
> - It is dequeued from sched_entity hierarchy and restricted from being
> re-enqueued. This means that new/waking children of this entity will be
> queued up to it, but not past it.
> - It does not contribute to weight calculations in tg_shares_up
> - In the case that the cfs_rq of the cpu we are trying to pull from is throttled
> it is ignored by the loadbalancer in __load_balance_fair() and
> move_one_task_fair().
>
> Signed-off-by: Paul Turner <[email protected]>
> Signed-off-by: Nikhil Rao <[email protected]>
> Signed-off-by: Bharata B Rao <[email protected]>


Acked-by: Balbir Singh <[email protected]>

--
Three Cheers,
Balbir

2011-02-23 13:32:49

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:

> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> +{
> + return cfs_rq->throttled;
> +}
> +
> +/* it's possible to be 'on_rq' in a dequeued (e.g. throttled) hierarchy */
> +static inline int entity_on_rq(struct sched_entity *se)
> +{
> + for_each_sched_entity(se)
> + if (!se->on_rq)
> + return 0;

Please add block braces over multi line stmts even if not strictly
needed.

> +
> + return 1;
> +}


> @@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
> u64 now, delta;
> unsigned long load = cfs_rq->load.weight;
>
> - if (cfs_rq->tg == &root_task_group)
> + /*
> + * Don't maintain averages for the root task group, or while we are
> + * throttled.
> + */
> + if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
> return;
>
> now = rq_of(cfs_rq)->clock_task;

Placing the return there avoids updating the timestamps, so once we get
unthrottled we'll observe a very long period and skew the load avg?

Ideally we'd never call this on throttled groups to begin with and
handle them like full dequeue/enqueue like things.

> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> * Update run-time statistics of the 'current'.
> */
> update_curr(cfs_rq);
> +
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> + if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
> + !group_cfs_rq(se)->nr_running))
> + return;
> +#endif
> +
> update_cfs_load(cfs_rq, 0);
> account_entity_enqueue(cfs_rq, se);
> update_cfs_shares(cfs_rq);
> @@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> */
> update_curr(cfs_rq);
>
> +#ifdef CONFIG_CFS_BANDWIDTH
> + if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
> + return;
> +#endif
> +
> update_stats_dequeue(cfs_rq, se);
> if (flags & DEQUEUE_SLEEP) {
> #ifdef CONFIG_SCHEDSTATS

These make me very nervous, on enqueue you bail after adding
min_vruntime to ->vruntime and calling update_curr(), but on dequeue you
bail before subtracting min_vruntime from ->vruntime.

> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> break;
> cfs_rq = cfs_rq_of(se);
> enqueue_entity(cfs_rq, se, flags);
> + /* don't continue to enqueue if our parent is throttled */
> + if (cfs_rq_throttled(cfs_rq))
> + break;
> flags = ENQUEUE_WAKEUP;
> }
>
> @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
> cfs_rq = cfs_rq_of(se);
> dequeue_entity(cfs_rq, se, flags);
>
> - /* Don't dequeue parent if it has other entities besides us */
> - if (cfs_rq->load.weight)
> + /*
> + * Don't dequeue parent if it has other entities besides us,
> + * or if it is throttled
> + */
> + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> break;
> flags |= DEQUEUE_SLEEP;
> }

How could we even be running if our parent was throttled?

> @@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
> return delta;
> }
>
> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> +{
> + struct sched_entity *se;
> +
> + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> +
> + /* account load preceeding throttle */

My spell checker thinks that should be written as: preceding.

> + update_cfs_load(cfs_rq, 0);
> +
> + /* prevent previous buddy nominations from re-picking this se */
> + clear_buddies(cfs_rq_of(se), se);
> +
> + /*
> + * It's possible for the current task to block and re-wake before task
> + * switch, leading to a throttle within enqueue_task->update_curr()
> + * versus an an entity that has not technically been enqueued yet.

I'm not quite seeing how this would happen.. care to expand on this?

> + * In this case, since we haven't actually done the enqueue yet, cut
> + * out and allow enqueue_entity() to short-circuit
> + */
> + if (!se->on_rq)
> + goto out_throttled;
> +
> + for_each_sched_entity(se) {
> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> + dequeue_entity(cfs_rq, se, 1);
> + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> + break;
> + }
> +
> +out_throttled:
> + cfs_rq->throttled = 1;
> + update_cfs_rq_load_contribution(cfs_rq, 1);
> +}
> +
> static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
> unsigned long delta_exec)
> {
> @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
>
> cfs_rq->quota_used += delta_exec;
>
> - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> + if (cfs_rq_throttled(cfs_rq) ||
> + cfs_rq->quota_used < cfs_rq->quota_assigned)
> return;

So we are throttled but running anyway, I suppose this comes from the PI
ceiling muck?

> cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
> +
> + if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
> + throttle_cfs_rq(cfs_rq);
> + resched_task(cfs_rq->rq->curr);
> + }
> }
>
> static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
> @@ -1941,6 +2033,12 @@ static void check_preempt_wakeup(struct
> if (unlikely(se == pse))
> return;
>
> +#ifdef CONFIG_CFS_BANDWIDTH
> + /* avoid pre-emption check/buddy nomination for throttled tasks */

Somehow my spell checker doesn't like that hyphen.

> + if (!entity_on_rq(pse))
> + return;
> +#endif

Ideally that #ifdef'ery would go away too.

> if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
> set_next_buddy(pse);
>
> @@ -2060,7 +2158,8 @@ static bool yield_to_task_fair(struct rq
> {
> struct sched_entity *se = &p->se;
>
> - if (!se->on_rq)
> + /* ensure entire hierarchy is on rq (e.g. running & not throttled) */
> + if (!entity_on_rq(se))
> return false;

like here..

> /* Tell the scheduler that we'd really like pse to run next. */
> @@ -2280,7 +2379,8 @@ static void update_shares(int cpu)
>
> rcu_read_lock();
> for_each_leaf_cfs_rq(rq, cfs_rq)
> - update_shares_cpu(cfs_rq->tg, cpu);
> + if (!cfs_rq_throttled(cfs_rq))
> + update_shares_cpu(cfs_rq->tg, cpu);

This wants extra braces

> rcu_read_unlock();
> }
>
> @@ -2304,9 +2404,10 @@ load_balance_fair(struct rq *this_rq, in
> u64 rem_load, moved_load;
>
> /*
> - * empty group
> + * empty group or throttled cfs_rq
> */
> - if (!busiest_cfs_rq->task_weight)
> + if (!busiest_cfs_rq->task_weight ||
> + cfs_rq_throttled(busiest_cfs_rq))
> continue;
>
> rem_load = (u64)rem_load_move * busiest_weight;
>
>

2011-02-24 05:20:49

by Bharata B Rao

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

Hi Peter,

I will only answer a couple of your questions and let Paul clarify the rest...

On Wed, Feb 23, 2011 at 02:32:13PM +0100, Peter Zijlstra wrote:
> On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>
>
> > @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> > break;
> > cfs_rq = cfs_rq_of(se);
> > enqueue_entity(cfs_rq, se, flags);
> > + /* don't continue to enqueue if our parent is throttled */
> > + if (cfs_rq_throttled(cfs_rq))
> > + break;
> > flags = ENQUEUE_WAKEUP;
> > }
> >
> > @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
> > cfs_rq = cfs_rq_of(se);
> > dequeue_entity(cfs_rq, se, flags);
> >
> > - /* Don't dequeue parent if it has other entities besides us */
> > - if (cfs_rq->load.weight)
> > + /*
> > + * Don't dequeue parent if it has other entities besides us,
> > + * or if it is throttled
> > + */
> > + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> > break;
> > flags |= DEQUEUE_SLEEP;
> > }
>
> How could we even be running if our parent was throttled?

The task isn't running actually. One of its parents up in the hierarchy has
been throttled and already dequeued. Now this task sits on its immediate
parent's runqueue which isn't throttled, but it isn't really running either since
the hierarchy is throttled. In this situation, the load balancer can try to pull
this task. When that happens, the load balancer tries to dequeue it and this
check will ensure that we don't attempt to dequeue a group entity in our
hierarchy which has already been dequeued.

> > @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
> >
> > cfs_rq->quota_used += delta_exec;
> >
> > - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> > + if (cfs_rq_throttled(cfs_rq) ||
> > + cfs_rq->quota_used < cfs_rq->quota_assigned)
> > return;
>
> So we are throttled but running anyway, I suppose this comes from the PI
> ceiling muck?

When a cfs_rq is throttled, its representative se (and all its parent
se's) get dequeued and the task is marked for resched. But the task entity is
still on its throttled parent's cfs_rq (=> task->se.on_rq = 1). Next, during
put_prev_task_fair(), we enqueue the task back on its throttled parent's
cfs_rq, at which time we end up calling update_curr() on the throttled cfs_rq.
This check helps us bail out of that situation.
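
(For reference, a minimal sketch of the put_prev_entity() step described
above, based on the stock scheduler code of this era with details trimmed --
the update_curr() charge and rbtree re-insertion happen even when the
cfs_rq was just throttled, hence the early return in
account_cfs_rq_quota():)

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
        /* 'prev' was just running; charge its final slice of runtime.
         * Under CONFIG_CFS_BANDWIDTH this reaches account_cfs_rq_quota(). */
        if (prev->on_rq)
                update_curr(cfs_rq);

        if (prev->on_rq) {
                /* put the entity back into the rbtree, even though the
                 * hierarchy above it may already be throttled */
                __enqueue_entity(cfs_rq, prev);
        }

        cfs_rq->curr = NULL;
}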

Regards,
Bharata.

2011-02-24 11:05:33

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, 2011-02-24 at 10:51 +0530, Bharata B Rao wrote:
> Hi Peter,
>
> I will only answer a couple of your questions and let Paul clarify the rest...
>
> On Wed, Feb 23, 2011 at 02:32:13PM +0100, Peter Zijlstra wrote:
> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
> >
> >
> > > @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> > > break;
> > > cfs_rq = cfs_rq_of(se);
> > > enqueue_entity(cfs_rq, se, flags);
> > > + /* don't continue to enqueue if our parent is throttled */
> > > + if (cfs_rq_throttled(cfs_rq))
> > > + break;
> > > flags = ENQUEUE_WAKEUP;
> > > }
> > >
> > > @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
> > > cfs_rq = cfs_rq_of(se);
> > > dequeue_entity(cfs_rq, se, flags);
> > >
> > > - /* Don't dequeue parent if it has other entities besides us */
> > > - if (cfs_rq->load.weight)
> > > + /*
> > > + * Don't dequeue parent if it has other entities besides us,
> > > + * or if it is throttled
> > > + */
> > > + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> > > break;
> > > flags |= DEQUEUE_SLEEP;
> > > }
> >
> > How could we even be running if our parent was throttled?
>
> The task isn't running actually. One of its parents up in the hierarchy has
> been throttled and already dequeued. Now this task sits on its immediate
> parent's runqueue which isn't throttled, but it isn't really running either since
> the hierarchy is throttled. In this situation, the load balancer can try to pull
> this task. When that happens, the load balancer tries to dequeue it and this
> check will ensure that we don't attempt to dequeue a group entity in our
> hierarchy which has already been dequeued.

That's insane, it's throttled, that means it should be dequeued and
should thus be invisible to the load-balancer. If it is visible the
load-balancer will try and move tasks around to balance load, but all in
vain, it'll move phantom loads around and get most confused at best.

Pure and utter suckage if you ask me.

> > > @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
> > >
> > > cfs_rq->quota_used += delta_exec;
> > >
> > > - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> > > + if (cfs_rq_throttled(cfs_rq) ||
> > > + cfs_rq->quota_used < cfs_rq->quota_assigned)
> > > return;
> >
> > So we are throttled but running anyway, I suppose this comes from the PI
> > ceiling muck?
>
> When a cfs_rq is throttled, its representative se (and all its parent
> se's) get dequeued and the task is marked for resched. But the task entity is
> still on its throttled parent's cfs_rq (=> task->se.on_rq = 1). Next during
> put_prev_task_fair(), we enqueue the task back on its throttled parent's
> cfs_rq at which time we end up calling update_curr() on throttled cfs_rq.
> This check would help us bail out from that situation.

But why bother with this early exit? At worst you'll call
tg_request_cfs_quota() in vain, at best you'll find there is runtime
because the period tick just happened on another cpu and you're good to
go, yay!

2011-02-24 15:45:40

by Bharata B Rao

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, Feb 24, 2011 at 12:05:01PM +0100, Peter Zijlstra wrote:
> On Thu, 2011-02-24 at 10:51 +0530, Bharata B Rao wrote:
> > Hi Peter,
> >
> > I will only answer a couple of your questions and let Paul clarify the rest...
> >
> > On Wed, Feb 23, 2011 at 02:32:13PM +0100, Peter Zijlstra wrote:
> > > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
> > >
> > >
> > > > @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> > > > break;
> > > > cfs_rq = cfs_rq_of(se);
> > > > enqueue_entity(cfs_rq, se, flags);
> > > > + /* don't continue to enqueue if our parent is throttled */
> > > > + if (cfs_rq_throttled(cfs_rq))
> > > > + break;
> > > > flags = ENQUEUE_WAKEUP;
> > > > }
> > > >
> > > > @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
> > > > cfs_rq = cfs_rq_of(se);
> > > > dequeue_entity(cfs_rq, se, flags);
> > > >
> > > > - /* Don't dequeue parent if it has other entities besides us */
> > > > - if (cfs_rq->load.weight)
> > > > + /*
> > > > + * Don't dequeue parent if it has other entities besides us,
> > > > + * or if it is throttled
> > > > + */
> > > > + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> > > > break;
> > > > flags |= DEQUEUE_SLEEP;
> > > > }
> > >
> > > How could we even be running if our parent was throttled?
> >
> > The task isn't running actually. One of its parents up in the hierarchy has
> > been throttled and already dequeued. Now this task sits on its immediate
> > parent's runqueue which isn't throttled, but it isn't really running either since
> > the hierarchy is throttled. In this situation, the load balancer can try to pull
> > this task. When that happens, the load balancer tries to dequeue it and this
> > check will ensure that we don't attempt to dequeue a group entity in our
> > hierarchy which has already been dequeued.
>
> That's insane, it's throttled, that means it should be dequeued and
> should thus be invisible to the load-balancer. If it is visible the
> load-balancer will try and move tasks around to balance load, but all in
> vain, it'll move phantom loads around and get most confused at best.

We can't walk the se hierarchy downwards and hence can't really dequeue
the entire hierarchy if any one entity in the hierarchy is throttled.
However, these semantics of retaining the child entities enqueued while
dequeuing the entities upwards of a throttled entity make our life simple
during unthrottling. We just have to enqueue the entities upwards of the
throttled entity and the rest of the entities downwards automatically become
available.

While I admit that our load balancing semantics wrt throttled entities are
not consistent (we don't allow pulling of tasks directly from throttled
cfs_rqs, while allowing pulling of tasks from a throttled hierarchy as in the
above case), I am beginning to wonder if it works out to be advantageous.
Is there a chance that the task gets to run on another CPU where the hierarchy
isn't throttled since runtime is still available ?

>
> Pure and utter suckage if you ask me.
>
> > > > @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
> > > >
> > > > cfs_rq->quota_used += delta_exec;
> > > >
> > > > - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> > > > + if (cfs_rq_throttled(cfs_rq) ||
> > > > + cfs_rq->quota_used < cfs_rq->quota_assigned)
> > > > return;
> > >
> > > So we are throttled but running anyway, I suppose this comes from the PI
> > > ceiling muck?
> >
> > When a cfs_rq is throttled, its representative se (and all its parent
> > se's) get dequeued and the task is marked for resched. But the task entity is
> > still on its throttled parent's cfs_rq (=> task->se.on_rq = 1). Next during
> > put_prev_task_fair(), we enqueue the task back on its throttled parent's
> > cfs_rq at which time we end up calling update_curr() on throttled cfs_rq.
> > This check would help us bail out from that situation.
>
> But why bother with this early exit? At worst you'll call
> tg_request_cfs_quota() in vain, at best you'll find there is runtime
> because the period tick just happened on another cpu and you're good to
> go, yay!

I see your point. I had this check in my version of hard limits patches earlier
for the reason I described above. Let's see if Paul had any other reason
to retain this check.

Regards,
Bharata.

2011-02-24 15:53:38

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, 2011-02-24 at 21:15 +0530, Bharata B Rao wrote:
> While I admit that our load balancing semantics wrt throttled entities are
> not consistent (we don't allow pulling of tasks directly from throttled
> cfs_rqs, while allowing pulling of tasks from a throttled hierarchy as in the
> above case), I am beginning to wonder if it works out to be advantageous.
> Is there a chance that the task gets to run on another CPU where the hierarchy
> isn't throttled since runtime is still available ?

Possible yes, but the load-balancer doesn't know about that, nor should
it (it's complicated, and broken, enough, no need to add more cruft to
it).

I'm starting to think you all should just toss all this and start over,
its just too smelly.

2011-02-24 16:39:43

by Bharata B Rao

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, Feb 24, 2011 at 04:52:53PM +0100, Peter Zijlstra wrote:
> On Thu, 2011-02-24 at 21:15 +0530, Bharata B Rao wrote:
> > While I admit that our load balancing semantics wrt throttled entities are
> > not consistent (we don't allow pulling of tasks directly from throttled
> > cfs_rqs, while allowing pulling of tasks from a throttled hierarchy as in the
> > above case), I am beginning to wonder if it works out to be advantageous.
> > Is there a chance that the task gets to run on another CPU where the hierarchy
> > isn't throttled since runtime is still available ?
>
> Possible yes, but the load-balancer doesn't know about that, nor should
> it (it's complicated, and broken, enough, no need to add more cruft to
> it).
>
> I'm starting to think you all should just toss all this and start over,
> its just too smelly.

Hmm... You have brought up 3 concerns:

1. Hierarchy semantics

If you look at the hierarchy semantics we currently have while ignoring the
load balancer interactions for a moment, I guess what we have is a reasonable
one.

- Only group entities are throttled
- Throttled entities are taken off the runqueue and hence they never
get picked up for scheduling.
- New or child entities are queued up to the throttled entities and not
further up. As I said in another thread, having the tree intact and correct
underneath the throttled entity allows us to rebuild the hierarchy during
unthrottling with least amount of effort.
- Group entities in a hierarchy are throttled independent of each other based
on their bandwidth specification.

2. Handling of throttled entities by load balancer

This definitely needs to improve and be more consistent. We can work on this.

3. per-cgroup vs global period specification

I thought per-cgroup specification would be most flexible and hence started
out with that. This would allow groups/workloads/VMs to define their
own bandwidth rate.

Let us know if you have other design concerns besides these.

Regards,
Bharata.

2011-02-24 17:21:46

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, 2011-02-24 at 22:09 +0530, Bharata B Rao wrote:
> On Thu, Feb 24, 2011 at 04:52:53PM +0100, Peter Zijlstra wrote:
> > On Thu, 2011-02-24 at 21:15 +0530, Bharata B Rao wrote:
> > > While I admit that our load balancing semantics wrt throttled entities are
> > > not consistent (we don't allow pulling of tasks directly from throttled
> > > cfs_rqs, while allowing pulling of tasks from a throttled hierarchy as in the
> > > above case), I am beginning to wonder if it works out to be advantageous.
> > > Is there a chance that the task gets to run on another CPU where the hierarchy
> > > isn't throttled since runtime is still available ?
> >
> > Possible yes, but the load-balancer doesn't know about that, nor should
> > it (it's complicated, and broken, enough, no need to add more cruft to
> > it).
> >
> > I'm starting to think you all should just toss all this and start over,
> > its just too smelly.
>
> Hmm... You have brought up 3 concerns:
>
> 1. Hierarchy semantics
>
> If you look at the hierarchy semantics we currently have while ignoring the
> load balancer interactions for a moment, I guess what we have is a reasonable
> one.
>
> - Only group entities are throttled
> - Throttled entities are taken off the runqueue and hence they never
> get picked up for scheduling.
> - New or child entities are queued up to the throttled entities and not
> further up. As I said in another thread, having the tree intact and correct
> underneath the throttled entity allows us to rebuild the hierarchy during
> unthrottling with least amount of effort.

It also gets you into all that load-balancer mess, and I'm not going to
let you off lightly there.

> - Group entities in a hierarchy are throttled independent of each other based
> on their bandwidth specification.

That's missing out quite a few details.. for one there is no mention of
hierarchical implication of/constraints on bandwidth, can children have
more bandwidth than their parent (I hope not).

> 2. Handling of throttled entities by load balancer
>
> This definitely needs to improve and be more consistent. We can work on this.

Feh, improve is being nice about it, it needs a complete overhaul, the
current situation is a cobbled together leaky mess.

> 3. per-cgroup vs global period specification
>
> I thought per-cgroup specification would be most flexible and hence started
> out with that. This would allow groups/workloads/VMs to define their
> own bandwidth rate.

Most flexible yes, most 'interesting' too, now if you consider running a
child task is also running the parent entity and therefore you're
consuming bandwidth up the entire hierarchy, what happens when the
parent has a much larger period than the child?

In that case your child doesn't get run while the parent is throttled,
and the child's period is violated.
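
(A hypothetical worked example: give the parent 100ms of quota per 1s
period and the child 5ms per 10ms period. Running flat out, the child
drains the parent's 100ms by t=200ms; the parent then throttles until
the 1s replenishment, so the child is frozen for roughly 80 consecutive
10ms periods even though it never exceeded its own per-period quota.)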


> Let us know if you have other design concerns besides these.

Yeah, that weird time accounting muck, bandwidth should decrease on
usage and be incremented on replenishment, this gets you 0 as the natural
boundary between credit and debt, no need to keep two variables.
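
(A minimal sketch of that single-variable scheme, with hypothetical
names -- a signed runtime_remaining is decremented on use and topped up
on replenishment, so zero is the natural throttle boundary:)

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
                unsigned long delta_exec)
{
        /* hypothetical s64 field replacing quota_used/quota_assigned */
        cfs_rq->runtime_remaining -= delta_exec;
        if (cfs_rq->runtime_remaining > 0)
                return;

        /* in debt: try to refill from the global pool */
        cfs_rq->runtime_remaining += tg_request_cfs_quota(cfs_rq->tg);
        if (cfs_rq->runtime_remaining <= 0) {
                throttle_cfs_rq(cfs_rq);
                resched_task(rq_of(cfs_rq)->curr);
        }
}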

Also, the above just about covers all the patch set does, isn't that
enough justification to throw the thing out and start over?

2011-02-25 03:11:34

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>
>> +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>> +{
>> +       return cfs_rq->throttled;
>> +}
>> +
>> +/* it's possible to be 'on_rq' in a dequeued (e.g. throttled) hierarchy */
>> +static inline int entity_on_rq(struct sched_entity *se)
>> +{
>> +       for_each_sched_entity(se)
>> +               if (!se->on_rq)
>> +                       return 0;
>
> Please add block braces over multi line stmts even if not strictly
> needed.
>

Done

>> +
>> +       return 1;
>> +}
>
>
>> @@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
>>        u64 now, delta;
>>        unsigned long load = cfs_rq->load.weight;
>>
>> -      if (cfs_rq->tg == &root_task_group)
>> +      /*
>> +       * Don't maintain averages for the root task group, or while we are
>> +       * throttled.
>> +       */
>> +      if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
>>                return;
>>
>>        now = rq_of(cfs_rq)->clock_task;
>
> Placing the return there avoids updating the timestamps, so once we get
> unthrottled we'll observe a very long period and skew the load avg?
>

It's easier to avoid this by fixing up the load average on unthrottle,
since there's no point in moving up the intermediate timestamps on
each throttled update.

The one "gotcha" in either case is that it's possible for time to
drift on the child of a throttled group and I don't see an easy way
around this.

> Ideally we'd never call this on throttled groups to begin with and
> handle them like full dequeue/enqueue like things.
>

This is what is attempted -- however actions such as wakeup may still
occur against throttled groups regardless of their queue state.

In this case we still need to preserve the correct child hierarchy
state so that it can be re-enqueued when there is again bandwidth.

>> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>>         * Update run-time statistics of the 'current'.
>>         */
>>        update_curr(cfs_rq);
>> +
>> +
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +      if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
>> +          !group_cfs_rq(se)->nr_running))
>> +              return;
>> +#endif
>> +
>>        update_cfs_load(cfs_rq, 0);
>>        account_entity_enqueue(cfs_rq, se);
>>        update_cfs_shares(cfs_rq);
>> @@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>>         */
>>        update_curr(cfs_rq);
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +      if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
>> +              return;
>> +#endif
>> +
>>        update_stats_dequeue(cfs_rq, se);
>>        if (flags & DEQUEUE_SLEEP) {
>>  #ifdef CONFIG_SCHEDSTATS
>
> These make me very nervous, on enqueue you bail after adding
> min_vruntime to ->vruntime and calling update_curr(), but on dequeue you
> bail before subtracting min_vruntime from ->vruntime.
>

min_vruntime shouldn't be added in enqueue since unthrottling is
treated as a wakeup (which results in placement versus min as opposed
to normalization).
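
(For context, a rough sketch of the stock enqueue_entity() vruntime
handling this relies on -- flag names from memory of kernels of this
era, so treat them as assumptions: with ENQUEUE_WAKEUP set,
min_vruntime is not added back, and place_entity() positions the
entity against min_vruntime instead:)

        /* non-wakeup enqueue: re-normalize against min_vruntime */
        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
                se->vruntime += cfs_rq->min_vruntime;
        ...
        /* wakeup (and unthrottle-as-wakeup): place instead */
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }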

>> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
>>                        break;
>>                cfs_rq = cfs_rq_of(se);
>>                enqueue_entity(cfs_rq, se, flags);
>> +              /* don't continue to enqueue if our parent is throttled */
>> +              if (cfs_rq_throttled(cfs_rq))
>> +                      break;
>>                flags = ENQUEUE_WAKEUP;
>>        }
>>
>> @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
>>                cfs_rq = cfs_rq_of(se);
>>                dequeue_entity(cfs_rq, se, flags);
>>
>> -              /* Don't dequeue parent if it has other entities besides us */
>> -              if (cfs_rq->load.weight)
>> +              /*
>> +               * Don't dequeue parent if it has other entities besides us,
>> +               * or if it is throttled
>> +               */
>> +              if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>>                        break;
>>                flags |= DEQUEUE_SLEEP;
>>        }
>
> How could we even be running if our parent was throttled?
>

It's possible we throttled within the preceding dequeue_entity -- the
partial update_curr against cfs_rq might be just enough to push it
over the edge, in which case that entity has already been dequeued
and we want to bail out.

>> @@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
>>        return delta;
>>  }
>>
>> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> +{
>> +      struct sched_entity *se;
>> +
>> +      se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>> +
>> +      /* account load preceeding throttle */
>
> My spell checker thinks that should be written as: preceding.
>

My fat fingers have corrected this typo.

>> +      update_cfs_load(cfs_rq, 0);
>> +
>> +      /* prevent previous buddy nominations from re-picking this se */
>> +      clear_buddies(cfs_rq_of(se), se);
>> +
>> +      /*
>> +       * It's possible for the current task to block and re-wake before task
>> +       * switch, leading to a throttle within enqueue_task->update_curr()
>> +       * versus an an entity that has not technically been enqueued yet.
>
> I'm not quite seeing how this would happen.. care to expand on this?
>

I'm not sure the example Bharata gave is correct -- I'm going to treat
that discussion separately as it's not the intent here.

Here the task _is_ running.

Specifically:

- Suppose the current task on a cfs_rq blocks
- Accordingly we issue dequeue against that task (however it remains
as curr until the put)
- Before we get to the put some other activity (e.g. network bottom
half) gets to run and re-wake the task
- The time elapsed for this is charged to the task, which might push
it over its reservation; it then gets throttled while we're trying to
queue it

BUT

We haven't actually done any of the enqueue work yet, so there's
nothing to do to take it off the rq. So we just mark it throttled
and make sure that the rest of the enqueue work gets short-circuited.

The clock_task helps reduce the occurrence of this since the task will
be spared the majority of the SI time but it's still possible to push
it over.


>> +       * In this case, since we haven't actually done the enqueue yet, cut
>> +       * out and allow enqueue_entity() to short-circuit
>> +       */
>> +      if (!se->on_rq)
>> +              goto out_throttled;
>> +
>> +      for_each_sched_entity(se) {
>> +              struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> +
>> +              dequeue_entity(cfs_rq, se, 1);
>> +              if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>> +                      break;
>> +      }
>> +
>> +out_throttled:
>> +      cfs_rq->throttled = 1;
>> +      update_cfs_rq_load_contribution(cfs_rq, 1);
>> +}
>> +
>>  static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
>>                unsigned long delta_exec)
>>  {
>> @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
>>
>>        cfs_rq->quota_used += delta_exec;
>>
>> -      if (cfs_rq->quota_used < cfs_rq->quota_assigned)
>> +      if (cfs_rq_throttled(cfs_rq) ||
>> +              cfs_rq->quota_used < cfs_rq->quota_assigned)
>>                return;
>
> So we are throttled but running anyway, I suppose this comes from the PI
> ceiling muck?
>

No -- this is just the fact that there are cases where reschedule
can't evict the task immediately.

e.g. softirq, or any kernel time without CONFIG_PREEMPT

Once we're throttled we know there's no time left or point in trying
to acquire it, so just short-circuit these until we get to a point
where this task can be removed from the rq.


>>        cfs_rq->quota_assigned += tg_request_cfs_quota(cfs_rq->tg);
>> +
>> +      if (cfs_rq->quota_used >= cfs_rq->quota_assigned) {
>> +              throttle_cfs_rq(cfs_rq);
>> +              resched_task(cfs_rq->rq->curr);
>> +      }
>>  }
>>
>>  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
>> @@ -1941,6 +2033,12 @@ static void check_preempt_wakeup(struct
>>        if (unlikely(se == pse))
>>                return;
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +      /* avoid pre-emption check/buddy nomination for throttled tasks */
>
> Somehow my spell checker doesn't like that hyphen.
>

Fixed

>> +      if (!entity_on_rq(pse))
>> +              return;
>> +#endif
>
> Ideally that #ifdef'ery would go away too.

This can 100% go away (and is already in the #ifdefs), but it will
always be true in the !BANDWIDTH case, so it's only a micro-overhead.
The accompanying micro-optimization isn't really needed :)

>
>>        if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
>>                set_next_buddy(pse);
>>
>> @@ -2060,7 +2158,8 @@ static bool yield_to_task_fair(struct rq
>>  {
>>        struct sched_entity *se = &p->se;
>>
>> -      if (!se->on_rq)
>> +      /* ensure entire hierarchy is on rq (e.g. running & not throttled) */
>> +      if (!entity_on_rq(se))
>>                return false;
>
> like here..
>
>>        /* Tell the scheduler that we'd really like pse to run next. */
>> @@ -2280,7 +2379,8 @@ static void update_shares(int cpu)
>>
>>        rcu_read_lock();
>>        for_each_leaf_cfs_rq(rq, cfs_rq)
>> -              update_shares_cpu(cfs_rq->tg, cpu);
>> +              if (!cfs_rq_throttled(cfs_rq))
>> +                      update_shares_cpu(cfs_rq->tg, cpu);
>
> This wants extra braces
>

Fixed

>>        rcu_read_unlock();
>>  }
>>
>> @@ -2304,9 +2404,10 @@ load_balance_fair(struct rq *this_rq, in
>>                u64 rem_load, moved_load;
>>
>>                /*
>> -               * empty group
>> +               * empty group or throttled cfs_rq
>>                 */
>> -              if (!busiest_cfs_rq->task_weight)
>> +              if (!busiest_cfs_rq->task_weight ||
>> +                              cfs_rq_throttled(busiest_cfs_rq))
>>                        continue;
>>
>>                rem_load = (u64)rem_load_move * busiest_weight;
>>
>>
>
>

2011-02-25 03:42:01

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, Feb 24, 2011 at 3:05 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2011-02-24 at 10:51 +0530, Bharata B Rao wrote:
>> Hi Peter,
>>
>> I will only answer a couple of your questions and let Paul clarify the rest...
>>
>> On Wed, Feb 23, 2011 at 02:32:13PM +0100, Peter Zijlstra wrote:
>> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>> >
>> >
>> > > @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
>> > >                        break;
>> > >                cfs_rq = cfs_rq_of(se);
>> > >                enqueue_entity(cfs_rq, se, flags);
>> > > +              /* don't continue to enqueue if our parent is throttled */
>> > > +              if (cfs_rq_throttled(cfs_rq))
>> > > +                      break;
>> > >                flags = ENQUEUE_WAKEUP;
>> > >        }
>> > >
>> > > @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
>> > >                cfs_rq = cfs_rq_of(se);
>> > >                dequeue_entity(cfs_rq, se, flags);
>> > >
>> > > -              /* Don't dequeue parent if it has other entities besides us */
>> > > -              if (cfs_rq->load.weight)
>> > > +              /*
>> > > +               * Don't dequeue parent if it has other entities besides us,
>> > > +               * or if it is throttled
>> > > +               */
>> > > +              if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>> > >                        break;
>> > >                flags |= DEQUEUE_SLEEP;
>> > >        }
>> >
>> > How could we even be running if our parent was throttled?

The only way this can happen is if we are on our way out. The example
given doesn't apply to this case, I don't think.
>>
>> The task isn't running actually. One of its parents up in the hierarchy has
>> been throttled and already dequeued. Now this task sits on its immediate
>> parent's runqueue which isn't throttled, but it isn't really running either since
>> the hierarchy is throttled.
>> In this situation, the load balancer can try to pull
>> this task. When that happens, the load balancer tries to dequeue it and this
>> check will ensure that we don't attempt to dequeue a group entity in our
>> hierarchy which has already been dequeued.
>
> That's insane, it's throttled, that means it should be dequeued and
> should thus be invisible to the load-balancer.

I agree. We ensure this does not happen by making the h_load zero.
Something I thought I was doing but apparently not, will fix in
repost.

> load-balancer will try and move tasks around to balance load, but all in
> vain, it'll move phantom loads around and get most confused at best.
>

Yeah this shouldn't happen, I don't think this example is a valid one.

> Pure and utter suckage if you ask me.
>
>> > > @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
>> > >
>> > >  cfs_rq->quota_used += delta_exec;
>> > >
>> > > - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
>> > > + if (cfs_rq_throttled(cfs_rq) ||
>> > > +         cfs_rq->quota_used < cfs_rq->quota_assigned)
>> > >          return;
>> >
>> > So we are throttled but running anyway, I suppose this comes from the PI
>> > ceiling muck?
>>
>> When a cfs_rq is throttled, its representative se (and all its parent
>> se's) get dequeued and the task is marked for resched. But the task entity is
>> still on its throttled parent's cfs_rq (=> task->se.on_rq = 1). Next during
>> put_prev_task_fair(), we enqueue the task back on its throttled parent's
>> cfs_rq at which time we end up calling update_curr() on throttled cfs_rq.
>> This check would help us bail out from that situation.
>
> But why bother with this early exit? At worst you'll call
> tg_request_cfs_quota() in vain, at best you'll find there is runtime
> because the period tick just happened on another cpu and you're good to
> go, yay!

It's for the non-preemptible case where we could be running for a
non-trivial time after reschedule()

Considering your second point, I suppose there could be a micro-benefit
in checking in case the period tick did just happen to occur and then
self-unthrottling... but I don't think it's really worth it.


>
>
>

2011-02-25 03:59:53

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, Feb 24, 2011 at 9:20 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2011-02-24 at 22:09 +0530, Bharata B Rao wrote:
>> On Thu, Feb 24, 2011 at 04:52:53PM +0100, Peter Zijlstra wrote:
>> > On Thu, 2011-02-24 at 21:15 +0530, Bharata B Rao wrote:
>> > > While I admit that our load balancing semantics wrt throttled entities are
>> > > not consistent (we don't allow pulling of tasks directly from throttled
>> > > cfs_rqs, while allowing pulling of tasks from a throttled hierarchy as in the
>> > > above case), I am beginning to wonder if it works out to be advantageous.
>> > > Is there a chance that the task gets to run on another CPU where the hierarchy
>> > > isn't throttled since runtime is still available ?
>> >
>> > Possible yes, but the load-balancer doesn't know about that, nor should
>> > it (it's complicated, and broken, enough, no need to add more cruft to
>> > it).
>> >
>> > I'm starting to think you all should just toss all this and start over,
>> > its just too smelly.
>>
>> Hmm... You have brought up 3 concerns:
>>
>> 1. Hierarchy semantics
>>
>> If you look at the hierarchy semantics we currently have while ignoring the
>> load balancer interactions for a moment, I guess what we have is a reasonable
>> one.
>>
>> - Only group entities are throttled
>> - Throttled entities are taken off the runqueue and hence they never
>>   get picked up for scheduling.
>> - New or child entities are queued up to the throttled entities and not
>>   further up. As I said in another thread, having the tree intact and correct
>>   underneath the throttled entity allows us to rebuild the hierarchy during
>>   unthrottling with least amount of effort.
>
> It also gets you into all that load-balancer mess, and I'm not going to
> let you off lightly there.
>

I think the example was a little cuckoo. As you say, it's dequeued
and invisible to the load balancer.

The special case of block->wakeup->throttle->put only exists for the
current task which is ineligible for non-active load-balance anyway.

>> - Group entities in a hierarchy are throttled independent of each other based
>>   on their bandwidth specification.
>
> That's missing out quite a few details.. for one there is no mention of
> hierarchical implication of/constraints on bandwidth, can children have
> more bandwidth than their parent (I hope not).
>

I wasn't planning to enforce it since I believe there is value in
non-conformant constraints:

Consider:

- I have some application that I want to limit to 3 cpus
I have 2 workers in that application; across a period I would like
those workers to use a maximum of say 2.5 cpus each (suppose they
serve some sort of co-processor request per user and we want to
prevent a single user eating our entire limit and starving out
everything else).

The goal in this case is not preventing over-subscription, but
ensuring that some subset of the threads is not allowed to blow our entire
quota, while not destroying the (relatively) work-conserving aspect of
its performance in general.
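
(With hypothetical numbers: quota/period of 300ms/100ms on the
application caps it at 3 cpus, while 250ms/100ms on each of the 2
workers caps each at 2.5 cpus; the workers' limits sum to 5 cpus,
deliberately exceeding the parent's 3.)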

The above occurs sufficiently often that at the very least I think
conformance checking would have to be gated by a sysctl so that this
use case is still enabled.

- There's also the case of "I want to manage a newly abusive user,
being smart I've given his hierarchy a unique root so that I can
constrain them."
A non-conformant constraint avoids the adversarial problem of having
to find and bring all of their (possibly maliciously large) set limits
within the global limit I want to impose upon them.

My viewpoint was that if some idiot wants to set up such a tree
(unintentionally) it's their own damn fault but I suppose we should at
least give them a safety :) I'll add it.

>> 2. Handling of throttled entities by load balancer
>>
>> This definitely needs to improve and be more consistent. We can work on this.
>
> Feh, improve is being nice about it, it needs a complete overhaul, the
> current situation is a cobbled together leaky mess.
>

I think as long as the higher level semantics are correct and
throttling happens /sanely/ this is a non-issue.

>> 3. per-cgroup vs global period specification
>>
>> I thought per-cgroup specification would be most flexible and hence started
>> out with that. This would allow groups/workloads/VMs to define their
>> own bandwidth rate.
>
> Most flexible yes, most 'interesting' too, now if you consider running a
> child task is also running the parent entity and therefore you're
> consuming bandwidth up the entire hierarchy, what happens when the
> parent has a much larger period than the child?
>
> In that case your child doesn't get ran while the parent is throttled,
> and the child's period is violated.
>

There are definitely cases where this is both valid and useful. I
think gating conformance allows for both (especially if it defaults to
"on").

>
>> Let us know if you have other design concerns besides these.
>
> Yeah, that weird time accounting muck, bandwidth should decrease on
> usage and incremented on replenishment, this gets you 0 as the natural
> boundary between credit and debt, no need to keep two variables.
>

Yes, agreed! Fixing :)

> Also, the above just about covers all the patch set does, isn't that
> enough justification to throw the thing out and start over?
>

2011-02-25 13:59:09

by Bharata B Rao

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, Feb 24, 2011 at 07:10:58PM -0800, Paul Turner wrote:
> On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>
> >> +      update_cfs_load(cfs_rq, 0);
> >> +
> >> +      /* prevent previous buddy nominations from re-picking this se */
> >> +      clear_buddies(cfs_rq_of(se), se);
> >> +
> >> +      /*
> >> +       * It's possible for the current task to block and re-wake before task
> >> +       * switch, leading to a throttle within enqueue_task->update_curr()
> >> +       * versus an an entity that has not technically been enqueued yet.
> >
> > I'm not quite seeing how this would happen.. care to expand on this?
> >
>
> I'm not sure the example Bharata gave is correct -- I'm going to treat
> that discussion separately as it's not the intent here.

Just for the record, my examples were not given for the above question from
Peter.

I answered two questions and I am tempted to stand by those until proven
wrong :)

1. Why do we have the cfs_rq_throttled() check in dequeue_task_fair() ?
( => How could we be running if our parent was throttled ?)

Consider the following hierarchy.

Root Group
   |
   |
Group 1 (Bandwidth constrained group)
   |
   |
Group 2 (Infinite runtime group)

Assume both the groups have tasks in them.

When Group 1 is throttled, its cfs_rq is marked throttled, and is removed from
Root group's runqueue. But leaf tasks in Group 2 continue to be enqueued in
Group 1's runqueue.

Load balancer kicks in on CPU A and figures out that it can pull a few tasks
from CPU B (busiest_cpu). It iterates through all the task groups
(load_balance_fair) and considers Group 2 also. It tries to pull a task from
CPU B's cfs_rq for Group 2. I don't see anything that would make the
load balancer bail out here. Note that Group 2 is technically
not throttled, only its parent Group 1 is. The load balancer goes ahead and
starts pulling individual tasks from Group 2's cfs_rq on CPU B. This
results in dequeuing of a task whose hierarchy is throttled.

When the load balancer iterates through Group 1's cfs_rqs, the situation is
different because we have already marked Group 1's cfs_rqs as throttled.
We check this in load_balance_fair() and bail out from pulling tasks
from the throttled hierarchy.

This is my understanding. Let me know what I'm missing. Specifically, I would
like to understand how you ensure that the load balancer doesn't consider
tasks from throttled cfs_rqs for pulling.

2. Why is there a cfs_rq_throttled() check in account_cfs_rq_quota() ?

In addition to the case you described, I believe the situation I described
is also valid.

Regards,
Bharata.

2011-02-25 20:51:37

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Fri, Feb 25, 2011 at 5:58 AM, Bharata B Rao
<[email protected]> wrote:
> On Thu, Feb 24, 2011 at 07:10:58PM -0800, Paul Turner wrote:
>> On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>>
>> >> +      update_cfs_load(cfs_rq, 0);
>> >> +
>> >> +      /* prevent previous buddy nominations from re-picking this se */
>> >> +      clear_buddies(cfs_rq_of(se), se);
>> >> +
>> >> +      /*
>> >> +       * It's possible for the current task to block and re-wake before task
>> >> +       * switch, leading to a throttle within enqueue_task->update_curr()
>> >> +       * versus an an entity that has not technically been enqueued yet.
>> >
>> > I'm not quite seeing how this would happen.. care to expand on this?
>> >
>>
>> I'm not sure the example Bharata gave is correct -- I'm going to treat
>> that discussion separately as it's not the intent here.
>
> Just for the record, my examples were not given for the above question from
> Peter.
>
> I answered two questions and I am tempted to stand by those until proven
> wrong :)

This is important to get right, I'm happy to elaborate.

>
> 1. Why do we have the cfs_rq_throttled() check in dequeue_task_fair() ?

The check is primarily needed because we could become throttled as
part of a regular dequeue, at which point we bail because the parent
dequeue is actually complete.

(Were it necessitated by load balance we could actually not do this
and just perform a hierarchical check within load_balance_fair)
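
(A sketch of what such a hierarchical check might look like -- this
helper is hypothetical, not part of this series -- walking the
task_group ancestry for the cpu in question:)

static int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
        int cpu = cpu_of(rq_of(cfs_rq));
        struct task_group *tg;

        /* bail if this group, or any group above it, is throttled */
        for (tg = cfs_rq->tg; tg; tg = tg->parent) {
                if (cfs_rq_throttled(tg->cfs_rq[cpu]))
                        return 1;
        }

        return 0;
}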

> ( => How could we be running if our parent was throttled ?)
>

The only way we can be running if our parent was throttled is if /we/
triggered that throttle and have been marked for re-schedule.

> Consider the following hierarchy.
>
> Root Group
>    |
>    |
> Group 1 (Bandwidth constrained group)
>    |
>    |
> Group 2 (Infinite runtime group)
>
> Assume both the groups have tasks in them.
>
> When Group 1 is throttled, its cfs_rq is marked throttled, and is removed from
> Root group's runqueue. But leaf tasks in Group 2 continue to be enqueued in
> Group 1's runqueue.
>

Yes, the hierarchy state is maintained in isolation.

> Load balancer kicks in on CPU A and figures out that it can pull a few tasks
> from CPU B (busiest_cpu). It iterates through all the task groups
> (load_balance_fair) and considers Group 2 also. It tries to pull a task from
> CPU B's cfs_rq for Group 2. I don't see anything here that would make the
> load balancer bail out.

Per above, the descendants of a throttled group are also identified
(and appropriately skipped) using h_load.

> Note that Group 2 is technically
> not throttled, only its parent Group 1 is. The load balancer goes ahead and
> starts pulling individual tasks from Group 2's cfs_rq on CPU B.

In general, not true -- load balancing against a throttled hierarchy
is crazy[*].

> This results in dequeuing of a task whose hierarchy is throttled.
>

[*]: There is one edge case in which it may sanely occur:

Namely, if the load balance races with a throttle (since we don't take
rq->locks until we start actually moving tasks). In this case it's
still ok because the cached h_load ensures the load balancer is still
working from a sane load view; the effect is a minute re-ordering, as if
the load balance had occurred fractionally before the throttle instead
of fractionally after.


> When the load balancer iterates through Group 1's cfs_rqs, the situation is
> different because we have already marked Group 1's cfs_rqs as throttled.
> And we check this in load_balance_fair() and bail out from pulling tasks
> from the throttled hierarchy.
>
> This is my understanding. Let me know what I am missing. Specifically, I would
> like to understand how you ensure that the load balancer doesn't consider
> tasks from throttled cfs_rqs for pulling.
>
> 2. Why is there a cfs_rq_throttled() check in account_cfs_rq_quota()?
>
> In addition to the case you described, I believe the situation I described
> is also valid.
>

The point made above was that it's actually for any update that may
(legitimately) occur, as throttling cannot always be aligned with
eviction. The case you gave is one of several -- there's nothing
particularly unique about it (nor did I actually disagree with it).

> Regards,
> Bharata.
>

2011-02-28 03:50:36

by Bharata B Rao

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Fri, Feb 25, 2011 at 12:51:01PM -0800, Paul Turner wrote:
> On Fri, Feb 25, 2011 at 5:58 AM, Bharata B Rao
> <[email protected]> wrote:
> > On Thu, Feb 24, 2011 at 07:10:58PM -0800, Paul Turner wrote:
> >> On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> >> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
> >>
> >> >> +     update_cfs_load(cfs_rq, 0);
> >> >> +
> >> >> +     /* prevent previous buddy nominations from re-picking this se */
> >> >> +     clear_buddies(cfs_rq_of(se), se);
> >> >> +
> >> >> +     /*
> >> >> +      * It's possible for the current task to block and re-wake before task
> >> >> +      * switch, leading to a throttle within enqueue_task->update_curr()
> >> >> +      * versus an entity that has not technically been enqueued yet.
> >> >
> >> > I'm not quite seeing how this would happen.. care to expand on this?
> >> >
> >>
> >> I'm not sure the example Bharata gave is correct -- I'm going to treat
> >> that discussion separately as it's not the intent here.
> >
> > Just for the record, my examples were not given for the above question from
> > Peter.
> >
> > I answered two questions and I am tempted to stand by those until proven
> > wrong :)
>
> This is important to get right, I'm happy to elaborate.
>
> >
> > 1. Why do we have the cfs_rq_throttled() check in dequeue_task_fair()?
>
> > The check is primarily needed because we could become throttled as
> > part of a regular dequeue, at which point we bail because the parent
> > dequeue is actually complete.
> >
> > (Were it necessitated by load balance we could actually not do this
> > and just perform a hierarchical check within load_balance_fair)
>
> > ( => How could we be running if our parent was throttled ?)
> >
>
> The only way we can be running if our parent was throttled is if /we/
> triggered that throttle and have been marked for re-schedule.
>
> > Consider the following hierarchy.
> >
> > Root Group
> >   |
> >   |
> > Group 1 (Bandwidth constrained group)
> >   |
> >   |
> > Group 2 (Infinite runtime group)
> >
> > Assume both the groups have tasks in them.
> >
> > When Group 1 is throttled, its cfs_rq is marked throttled, and is removed from
> > Root group's runqueue. But leaf tasks in Group 2 continue to be enqueued in
> > Group 1's runqueue.
> >
>
> Yes, the hierarchy state is maintained in isolation.
>
> > Load balancer kicks in on CPU A and figures out that it can pull a few tasks
> > from CPU B (busiest_cpu). It iterates through all the task groups
> > (load_balance_fair) and considers Group 2 also. It tries to pull a task from
> > CPU B's cfs_rq for Group 2. I don't see anything here that would make the
> > load balancer bail out.
>
> Per above, the descendants of a throttled group are also identified
> (and appropriately skipped) using h_load.

This bit is still unclear to me. We do nothing in tg_load_down() to treat
throttled cfs_rqs differently when calculating h_load. Nor do we do
anything in load_balance_fair() to explicitly identify descendants of
a throttled group using h_load AFAICS. All we have is the
cfs_rq_throttled() check, which I think should be converted to entity_on_rq()
to check for the throttled hierarchy and discard pulling from throttled
hierarchies.
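
Something like the following (untested sketch against your v4; I am
guessing at the exact placement of the existing check) is what I have
in mind:

        --- load_balance_fair(), kernel/sched_fair.c
        -               if (cfs_rq_throttled(busiest_cfs_rq))
        +               /*
        +                * Skip cfs_rqs throttled anywhere in their
        +                * hierarchy, not only those marked throttled
        +                * themselves.
        +                */
        +               if (!entity_on_rq(tg->se[busiest_cpu]))
                                continue;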

Regards,
Bharata.

2011-02-28 06:39:08

by Paul Turner

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Sun, Feb 27, 2011 at 7:50 PM, Bharata B Rao
<[email protected]> wrote:
> On Fri, Feb 25, 2011 at 12:51:01PM -0800, Paul Turner wrote:
>> On Fri, Feb 25, 2011 at 5:58 AM, Bharata B Rao
>> <[email protected]> wrote:
>> > On Thu, Feb 24, 2011 at 07:10:58PM -0800, Paul Turner wrote:
>> >> On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
>> >> > On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>> >>
>> >> >> +     update_cfs_load(cfs_rq, 0);
>> >> >> +
>> >> >> +     /* prevent previous buddy nominations from re-picking this se */
>> >> >> +     clear_buddies(cfs_rq_of(se), se);
>> >> >> +
>> >> >> +     /*
>> >> >> +      * It's possible for the current task to block and re-wake before task
>> >> >> +      * switch, leading to a throttle within enqueue_task->update_curr()
>> >> >> +      * versus an entity that has not technically been enqueued yet.
>> >> >
>> >> > I'm not quite seeing how this would happen.. care to expand on this?
>> >> >
>> >>
>> >> I'm not sure the example Bharata gave is correct -- I'm going to treat
>> >> that discussion separately as it's not the intent here.
>> >
>> > Just for the record, my examples were not given for the above question from
>> > Peter.
>> >
>> > I answered two questions and I am tempted to stand by those until proven
>> > wrong :)
>>
>> This is important to get right, I'm happy to elaborate.
>>
>> >
>> > 1. Why do we have the cfs_rq_throttled() check in dequeue_task_fair()?
>>
>> The check is primarily needed because we could become throttled as
>> part of a regular dequeue, at which point we bail because the parent
>> dequeue is actually complete.
>>
>> (Were it necessitated by load balance we could actually not do this
>> and just perform a hierarchical check within load_balance_fair)
>>
>> > ( => How could we be running if our parent was throttled ?)
>> >
>>
>> The only way we can be running if our parent was throttled is if /we/
>> triggered that throttle and have been marked for re-schedule.
>>
>> > Consider the following hierarchy.
>> >
>> > Root Group
>> >   |
>> >   |
>> > Group 1 (Bandwidth constrained group)
>> >   |
>> >   |
>> > Group 2 (Infinite runtime group)
>> >
>> > Assume both the groups have tasks in them.
>> >
>> > When Group 1 is throttled, its cfs_rq is marked throttled, and is removed from
>> > Root group's runqueue. But leaf tasks in Group 2 continue to be enqueued in
>> > Group 1's runqueue.
>> >
>>
>> Yes, the hierarchy state is maintained in isolation.
>>
>> > Load balancer kicks in on CPU A and figures out that it can pull a few tasks
>> > from CPU B (busiest_cpu). It iterates through all the task groups
>> > (load_balance_fair) and considers Group 2 also. It tries to pull a task from
>> > CPU B's cfs_rq for Group 2. I don't see anything here that would make the
>> > load balancer bail out.
>>
>> Per above, the descendants of a throttled group are also identified
>> (and appropriately skipped) using h_load.
>
> This bit is still unclear to me. We do nothing in tg_load_down() to treat
> throttled cfs_rqs differently when calculating h_load.

From above:

"I agree. We ensure this does not happen by making the h_load zero.
Something I thought I was doing but apparently not, will fix in
repost."

> Nor do we do
> anything in load_balance_fair() to explicitly identify descendants of
> a throttled group using h_load AFAICS. All we have is the
> cfs_rq_throttled() check, which I think should be converted to entity_on_rq()
> to check for the throttled hierarchy and discard pulling from throttled
> hierarchies.
>
> Regards,
> Bharata.
>

2011-02-28 13:49:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Thu, 2011-02-24 at 19:10 -0800, Paul Turner wrote:

> >> @@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
> >> u64 now, delta;
> >> unsigned long load = cfs_rq->load.weight;
> >>
> >> - if (cfs_rq->tg == &root_task_group)
> >> + /*
> >> + * Don't maintain averages for the root task group, or while we are
> >> + * throttled.
> >> + */
> >> + if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
> >> return;
> >>
> >> now = rq_of(cfs_rq)->clock_task;
> >
> > Placing the return there avoids updating the timestamps, so once we get
> > unthrottled we'll observe a very long period and skew the load avg?
> >
>
> It's easier to avoid this by fixing up the load average on unthrottle,
> since there's no point in moving up the intermediate timestamps on
> each throttled update.
>
> The one "gotcha" in either case is that it's possible for time to
> drift on the child of a throttled group and I don't see an easy way
> around this.

drift how? running while being throttled due to non-preempt and other
things?

> > Ideally we'd never call this on throttled groups to begin with and
> > handle them like full dequeue/enqueue like things.
> >
>
This is what is attempted -- however it's still possible for actions such
as wakeup to occur against throttled groups regardless of
their queue state.
>
> In this case we still need to preserve the correct child hierarchy
> state so that it can be re-enqueued when there is again bandwidth.

If wakeup is the one sore spot, why not terminate the hierarchy
iteration in enqueue_task_fair that does all the load bits?

> >> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> >> * Update run-time statistics of the 'current'.
> >> */
> >> update_curr(cfs_rq);
> >> +
> >> +
> >> +#ifdef CONFIG_CFS_BANDWIDTH
> >> + if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
> >> + !group_cfs_rq(se)->nr_running))
> >> + return;
> >> +#endif
> >> +
> >> update_cfs_load(cfs_rq, 0);
> >> account_entity_enqueue(cfs_rq, se);
> >> update_cfs_shares(cfs_rq);
> >> @@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
> >> */
> >> update_curr(cfs_rq);
> >>
> >> +#ifdef CONFIG_CFS_BANDWIDTH
> >> + if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
> >> + return;
> >> +#endif
> >> +
> >> update_stats_dequeue(cfs_rq, se);
> >> if (flags & DEQUEUE_SLEEP) {
> >> #ifdef CONFIG_SCHEDSTATS
> >
> > These make me very nervous, on enqueue you bail after adding
> > min_vruntime to ->vruntime and calling update_curr(), but on dequeue you
> > bail before subtracting min_vruntime from ->vruntime.
> >
>
> min_vruntime shouldn't be added in enqueue since unthrottling is
> treated as a wakeup (which results in placement versus min as opposed
> to normalization).

Sure, but at least put a comment there, I mean that's a glaring
asymmetry.

> >> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> >> break;
> >> cfs_rq = cfs_rq_of(se);
> >> enqueue_entity(cfs_rq, se, flags);
> >> + /* don't continue to enqueue if our parent is throttled */
> >> + if (cfs_rq_throttled(cfs_rq))
> >> + break;
> >> flags = ENQUEUE_WAKEUP;
> >> }
> >>
> >> @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
> >> cfs_rq = cfs_rq_of(se);
> >> dequeue_entity(cfs_rq, se, flags);
> >>
> >> - /* Don't dequeue parent if it has other entities besides us */
> >> - if (cfs_rq->load.weight)
> >> + /*
> >> + * Don't dequeue parent if it has other entities besides us,
> >> + * or if it is throttled
> >> + */
> >> + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> >> break;
> >> flags |= DEQUEUE_SLEEP;
> >> }
> >
> > How could we even be running if our parent was throttled?
> >
>
> It's possible we throttled within the preceding dequeue_entity -- the
> partial update_curr against cfs_rq might be just enough to push it
> over the edge. In which case that entity has already been dequeued
> and we want to bail out.

right.

>
> >> @@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
> >> return delta;
> >> }
> >>
> >> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> >> +{
> >> + struct sched_entity *se;
> >> +
> >> + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> >> +
> >> + /* account load preceding throttle */
> >> + update_cfs_load(cfs_rq, 0);
> >> +
> >> + /* prevent previous buddy nominations from re-picking this se */
> >> + clear_buddies(cfs_rq_of(se), se);
> >> +
> >> + /*
> >> + * It's possible for the current task to block and re-wake before task
> >> + * switch, leading to a throttle within enqueue_task->update_curr()
> >> + * versus an entity that has not technically been enqueued yet.
> >
> > I'm not quite seeing how this would happen.. care to expand on this?
> >
>
> I'm not sure the example Bharata gave is correct -- I'm going to treat
> that discussion separately as it's not the intent here.
>
> Here the task _is_ running.
>
> Specifically:
>
> - Suppose the current task on a cfs_rq blocks
> - Accordingly we issue dequeue against that task (however it remains
> as curr until the put)
> - Before we get to the put some other activity (e.g. network bottom
> half) gets to run and re-wake the task
> - The time elapsed for this is charged to the task, which might push
> it over its reservation; it then gets throttled while we're trying to
> queue it
>
> BUT
>
> We haven't actually done any of the enqueue work yet so there's
> nothing to do to take it off rq. So we just mark it throttled
> and make sure that the rest of the enqueue work gets short-circuited.
>
> The clock_task helps reduce the occurrence of this since the task will
> be spared the majority of the SI time but it's still possible to push
> it over.

Ah, uhm, so this is all due to us dropping rq->lock after dequeue,
right? Would

https://lkml.org/lkml/2011/1/4/228

help here?

> >> + * In this case, since we haven't actually done the enqueue yet, cut
> >> + * out and allow enqueue_entity() to short-circuit
> >> + */
> >> + if (!se->on_rq)
> >> + goto out_throttled;
> >> +
> >> + for_each_sched_entity(se) {
> >> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
> >> +
> >> + dequeue_entity(cfs_rq, se, 1);
> >> + if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
> >> + break;
> >> + }
> >> +
> >> +out_throttled:
> >> + cfs_rq->throttled = 1;
> >> + update_cfs_rq_load_contribution(cfs_rq, 1);
> >> +}
> >> +
> >> static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
> >> unsigned long delta_exec)
> >> {
> >> @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
> >>
> >> cfs_rq->quota_used += delta_exec;
> >>
> >> - if (cfs_rq->quota_used < cfs_rq->quota_assigned)
> >> + if (cfs_rq_throttled(cfs_rq) ||
> >> + cfs_rq->quota_used < cfs_rq->quota_assigned)
> >> return;
> >
> > So we are throttled but running anyway, I suppose this comes from the PI
> > ceiling muck?
> >
>
> No -- this is just the fact that there are cases where reschedule
> can't evict the task immediately.
>
> e.g. softirq or any kernel time without CONFIG_PREEMPT
>
> Once we're throttled we know there's no time left or point in trying
> to acquire it, so we just short-circuit these until we get to a point
> where this task can be removed from the rq.

Right, but like I argued in another email, it could be refreshed on
another cpu and you now miss it.. :-)

> >> + if (!entity_on_rq(pse))
> >> + return;
> >> +#endif
> >
> > Ideally that #ifdef'ery would go away too.
>
> This can 100% go away (and is already in the #ifdefs), but it will
> always be true in the !BANDWIDTH case, so it's a micro-overhead.
> Accompanying micro-optimization isn't really needed :)

Wouldn't gcc be able to optimize if (!true) stmt; with DCE ?

2011-03-01 08:31:43

by Paul Turner

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Mon, Feb 28, 2011 at 5:48 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, 2011-02-24 at 19:10 -0800, Paul Turner wrote:
>
>> >> @@ -761,7 +788,11 @@ static void update_cfs_load(struct cfs_r
>> >>       u64 now, delta;
>> >>       unsigned long load = cfs_rq->load.weight;
>> >>
>> >> -     if (cfs_rq->tg == &root_task_group)
>> >> +     /*
>> >> +      * Don't maintain averages for the root task group, or while we are
>> >> +      * throttled.
>> >> +      */
>> >> +     if (cfs_rq->tg == &root_task_group || cfs_rq_throttled(cfs_rq))
>> >>               return;
>> >>
>> >>       now = rq_of(cfs_rq)->clock_task;
>> >
>> > Placing the return there avoids updating the timestamps, so once we get
>> > unthrottled we'll observe a very long period and skew the load avg?
>> >
>>
>> It's easier to avoid this by fixing up the load average on unthrottle,
>> since there's no point in moving up the intermediate timestamps on
>> each throttled update.
>>
>> The one "gotcha" in either case is that it's possible for time to
>> drift on the child of a throttled group and I don't see an easy way
>> around this.
>
> drift how? running while being throttled due to non-preempt and other
> things?
>

Not quite -- that time will actually be omitted since we nuke the
last_update on unthrottle.

What I was referring to here is that it's not easy (and not currently
done) to freeze the load average clock for descendants of a throttled
entity.

Plugging this properly is actually rather annoying since we'd have to
do a tree walk at both throttle and unthrottle (it's not sufficient to
just fix things up at unthrottle because wakeups can lead to updates
in the interim, meaning we'd need to mark some state).

Whether this drift is worth all that hassle is questionable at this point.

>> > Ideally we'd never call this on throttled groups to begin with and
>> > handle them like full dequeue/enqueue like things.
>> >
>>
>> This is what is attempted -- however it's still possible for actions such
>> as wakeup to occur against throttled groups regardless of
>> their queue state.
>>
>> In this case we still need to preserve the correct child hierarchy
>> state so that it can be re-enqueued when there is again bandwidth.
>
> If wakeup is the one sore spot, why not terminate the hierarchy
> iteration in enqueue_task_fair that does all the load bits?
>

It does actually terminate early already; the problem here is that the
processing for children has already occurred, since it's a bottom-up
enqueue.

Perhaps this can be simplified:

If we do a single walk at the start to validate whether the entity is
throttled (we can use entity_on_rq to keep things clean) then we can
pass a flag into enqueue_entity indicating whether it is part of a
throttled hierarchy.

This would allow us to skip the accounting and keep things accurate
above without having to do all the tree walks at throttle/unthrottle.

Let me see what this looks like for the next posting.
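
Roughly the shape I have in mind (untested sketch; ENQUEUE_THROTTLED is
a made-up flag name, not something in this series):

        static void
        enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        {
                struct cfs_rq *cfs_rq;
                struct sched_entity *se = &p->se, *iter;
                int throttled = 0;

                /* one walk up front: is any cfs_rq above us throttled? */
                for (iter = se; iter; iter = iter->parent)
                        if (cfs_rq_throttled(cfs_rq_of(iter)))
                                throttled = 1;

                for_each_sched_entity(se) {
                        if (se->on_rq)
                                break;
                        cfs_rq = cfs_rq_of(se);
                        /*
                         * A throttled hierarchy keeps its queue state
                         * correct, but enqueue_entity() skips the
                         * load/shares accounting when the flag is set.
                         */
                        enqueue_entity(cfs_rq, se, flags |
                                       (throttled ? ENQUEUE_THROTTLED : 0));
                        if (cfs_rq_throttled(cfs_rq))
                                break;
                        flags = ENQUEUE_WAKEUP;
                }

                hrtick_update(rq);
        }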

>> >> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>> >>        * Update run-time statistics of the 'current'.
>> >>        */
>> >>       update_curr(cfs_rq);
>> >> +
>> >> +
>> >> +#ifdef CONFIG_CFS_BANDWIDTH
>> >> +     if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
>> >> +          !group_cfs_rq(se)->nr_running))
>> >> +             return;
>> >> +#endif
>> >> +
>> >>       update_cfs_load(cfs_rq, 0);
>> >>       account_entity_enqueue(cfs_rq, se);
>> >>       update_cfs_shares(cfs_rq);
>> >> @@ -1087,6 +1126,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>> >>        */
>> >>       update_curr(cfs_rq);
>> >>
>> >> +#ifdef CONFIG_CFS_BANDWIDTH
>> >> +     if (!entity_is_task(se) && cfs_rq_throttled(group_cfs_rq(se)))
>> >> +             return;
>> >> +#endif
>> >> +
>> >>       update_stats_dequeue(cfs_rq, se);
>> >>       if (flags & DEQUEUE_SLEEP) {
>> >>  #ifdef CONFIG_SCHEDSTATS
>> >
>> > These make me very nervous, on enqueue you bail after adding
>> > min_vruntime to ->vruntime and calling update_curr(), but on dequeue you
>> > bail before subtracting min_vruntime from ->vruntime.
>> >
>>
>> min_vruntime shouldn't be added in enqueue since unthrottling is
>> treated as a wakeup (which results in placement versus min as opposed
>> to normalization).
>
> Sure, but at least put a comment there, I mean that's a glaring
> asymmetry.
>
>> >> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
>> >>                       break;
>> >>               cfs_rq = cfs_rq_of(se);
>> >>               enqueue_entity(cfs_rq, se, flags);
>> >> +             /* don't continue to enqueue if our parent is throttled */
>> >> +             if (cfs_rq_throttled(cfs_rq))
>> >> +                     break;
>> >>               flags = ENQUEUE_WAKEUP;
>> >>       }
>> >>
>> >> @@ -1390,8 +1437,11 @@ static void dequeue_task_fair(struct rq
>> >>               cfs_rq = cfs_rq_of(se);
>> >>               dequeue_entity(cfs_rq, se, flags);
>> >>
>> >> -             /* Don't dequeue parent if it has other entities besides us */
>> >> -             if (cfs_rq->load.weight)
>> >> +             /*
>> >> +              * Don't dequeue parent if it has other entities besides us,
>> >> +              * or if it is throttled
>> >> +              */
>> >> +             if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>> >>                       break;
>> >>               flags |= DEQUEUE_SLEEP;
>> >>       }
>> >
>> > How could we even be running if our parent was throttled?
>> >
>>
>> It's possible we throttled within the preceding dequeue_entity -- the
>> partial update_curr against cfs_rq might be just enough to push it
>> over the edge. In which case that entity has already been dequeued
>> and we want to bail out.
>
> right.
>
>>
>> >> @@ -1430,6 +1480,42 @@ static u64 tg_request_cfs_quota(struct t
>> >>       return delta;
>> >>  }
>> >>
>> >> +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> >> +{
>> >> +     struct sched_entity *se;
>> >> +
>> >> +     se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>> >> +
>> >> +     /* account load preceding throttle */
>> >> +     update_cfs_load(cfs_rq, 0);
>> >> +
>> >> +     /* prevent previous buddy nominations from re-picking this se */
>> >> +     clear_buddies(cfs_rq_of(se), se);
>> >> +
>> >> +     /*
>> >> +      * It's possible for the current task to block and re-wake before task
>> >> +      * switch, leading to a throttle within enqueue_task->update_curr()
>> >> +      * versus an entity that has not technically been enqueued yet.
>> >
>> > I'm not quite seeing how this would happen.. care to expand on this?
>> >
>>
>> I'm not sure the example Bharata gave is correct -- I'm going to treat
>> that discussion separately as it's not the intent here.
>>
>> Here the task _is_ running.
>>
>> Specifically:
>>
>> - Suppose the current task on a cfs_rq blocks
>> - Accordingly we issue dequeue against that task (however it remains
>> as curr until the put)
>> - Before we get to the put some other activity (e.g. network bottom
>> half) gets to run and re-wake the task
>> - The time elapsed for this is charged to the task, which might push
>> it over its reservation; it then gets throttled while we're trying to
>> queue it
>>
>> BUT
>>
>> We haven't actually done any of the enqueue work yet so there's
>> nothing to do to take it off rq. So we just mark it throttled
>> and make sure that the rest of the enqueue work gets short-circuited.
>>
>> The clock_task helps reduce the occurrence of this since the task will
>> be spared the majority of the SI time but it's still possible to push
>> it over.
>
> Ah, uhm, so this is all due to us dropping rq->lock after dequeue,
> right? Would
>
>  https://lkml.org/lkml/2011/1/4/228
>
> help here?
>

hmm, at a glance it'll still be cfs_rq->curr until we actually issue the put.

I don't think we can skip update_curr in the !p->on_rq case since for
the non-preempt kernel case we should be counting that time; so I
think this case would unfortunately still exist.

>> >> +      * In this case, since we haven't actually done the enqueue yet, cut
>> >> +      * out and allow enqueue_entity() to short-circuit
>> >> +      */
>> >> +     if (!se->on_rq)
>> >> +             goto out_throttled;
>> >> +
>> >> +     for_each_sched_entity(se) {
>> >> +             struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> >> +
>> >> +             dequeue_entity(cfs_rq, se, 1);
>> >> +             if (cfs_rq->load.weight || cfs_rq_throttled(cfs_rq))
>> >> +                     break;
>> >> +     }
>> >> +
>> >> +out_throttled:
>> >> +     cfs_rq->throttled = 1;
>> >> +     update_cfs_rq_load_contribution(cfs_rq, 1);
>> >> +}
>> >> +
>> >>  static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
>> >>               unsigned long delta_exec)
>> >>  {
>> >> @@ -1438,10 +1524,16 @@ static void account_cfs_rq_quota(struct
>> >>
>> >>       cfs_rq->quota_used += delta_exec;
>> >>
>> >> -     if (cfs_rq->quota_used < cfs_rq->quota_assigned)
>> >> +     if (cfs_rq_throttled(cfs_rq) ||
>> >> +             cfs_rq->quota_used < cfs_rq->quota_assigned)
>> >>               return;
>> >
>> > So we are throttled but running anyway, I suppose this comes from the PI
>> > ceiling muck?
>> >
>>
>> No -- this is just the fact that there are cases where reschedule
>> can't evict the task immediately.
>>
>> e.g. softirq or any kernel time without CONFIG_PREEMPT
>>
>> Once we're throttled we know there's no time left or point in trying
>> to acquire it, so we just short-circuit these until we get to a point
>> where this task can be removed from the rq.
>
> Right, but like I argued in another email, it could be refreshed on
> another cpu and you now miss it.. :-)
>

Yes, that would be undesirable... hmm, synchronizing the check under
rq->lock in do_sched_cfs_period_timer() should cover this.
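
Something along these lines (sketch only; unthrottle_cfs_rq() is a
stand-in for whatever the refresh path ends up doing, and the quota
refill itself is elided):

        static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b,
                                             int overrun)
        {
                struct task_group *tg =
                        container_of(cfs_b, struct task_group, cfs_bandwidth);
                int i;

                for_each_online_cpu(i) {
                        struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                        struct rq *rq = rq_of(cfs_rq);

                        raw_spin_lock(&rq->lock);
                        /*
                         * Check ->throttled under rq->lock so that a cpu
                         * racing in account_cfs_rq_quota() can't miss a
                         * refresh performed on another cpu.
                         */
                        if (cfs_rq_throttled(cfs_rq))
                                unthrottle_cfs_rq(cfs_rq);
                        raw_spin_unlock(&rq->lock);
                }

                return 0;
        }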

>> >> +     if (!entity_on_rq(pse))
>> >> +             return;
>> >> +#endif
>> >
>> > Ideally that #ifdef'ery would go away too.
>>
>> This can 100% go away (and is already in the #ifdefs), but it will
>> always be true in the !BANDWIDTH case, so it's a micro-overhead.
>> Accompanying micro-optimization isn't really needed :)
>
> Wouldn't gcc be able to optimize if (!true) stmt; with DCE ?
>

It's p->on_rq, so it's a memory ref; I don't think gcc can reduce
it since the path from ttwu -> check_preempt goes through a function
pointer.

although if it can handle that bifurcation then I'm super impressed :)
-- I'll check against the disassembly.

2011-03-02 07:23:44

by Bharata B Rao

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

Hi Paul,

On Tue, Feb 15, 2011 at 07:18:34PM -0800, Paul Turner wrote:
> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
> * Update run-time statistics of the 'current'.
> */
> update_curr(cfs_rq);
> +
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> + if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
> + !group_cfs_rq(se)->nr_running))
> + return;
> +#endif
> +
> update_cfs_load(cfs_rq, 0);
> account_entity_enqueue(cfs_rq, se);
> update_cfs_shares(cfs_rq);

> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
> break;
> cfs_rq = cfs_rq_of(se);
> enqueue_entity(cfs_rq, se, flags);
> + /* don't continue to enqueue if our parent is throttled */
> + if (cfs_rq_throttled(cfs_rq))
> + break;

1. This check (in enqueue_task_fair) ensures that if the cfs_rq we just enqueued
se to is throttled, we bail out from further enqueueing of the hierarchy.

2. In enqueue_entity() we check if the entity we are enqueueing owns a throttled
hierarchy and refuse to enqueue if true. We silently refuse, but continue
with further enqueue attempts.

I see that 1 can happen when a task belonging to a throttled group
wakes up.

Can you pls explain the scenario when 2 is needed ?

Regards,
Bharata.

2011-03-02 08:06:26

by Paul Turner

[permalink] [raw]
Subject: Re: [CFS Bandwidth Control v4 3/7] sched: throttle cfs_rq entities which exceed their local quota

On Tue, Mar 1, 2011 at 11:23 PM, Bharata B Rao
<[email protected]> wrote:
> Hi Paul,
>
> On Tue, Feb 15, 2011 at 07:18:34PM -0800, Paul Turner wrote:
>> @@ -1015,6 +1046,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>>        * Update run-time statistics of the 'current'.
>>        */
>>       update_curr(cfs_rq);
>> +
>> +
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +     if (!entity_is_task(se) && (cfs_rq_throttled(group_cfs_rq(se)) ||
>> +          !group_cfs_rq(se)->nr_running))
>> +             return;
>> +#endif
>> +
>>       update_cfs_load(cfs_rq, 0);
>>       account_entity_enqueue(cfs_rq, se);
>>       update_cfs_shares(cfs_rq);
>
>> @@ -1363,6 +1407,9 @@ enqueue_task_fair(struct rq *rq, struct
>>                       break;
>>               cfs_rq = cfs_rq_of(se);
>>               enqueue_entity(cfs_rq, se, flags);
>> +             /* don't continue to enqueue if our parent is throttled */
>> +             if (cfs_rq_throttled(cfs_rq))
>> +                     break;
>
> 1. This check (in enqueue_task_fair) ensures that if the cfs_rq we just enqueued
> se to is throttled, we bail out from further enqueueing of the hierarchy.
>
> 2. In enqueue_entity() we check if the entity we are enqueueing owns a throttled
> hierarchy and refuse to enqueue if true. We silently refuse, but continue
> with further enqueue attempts.
>
> I see that 1 can happen when a task belonging to a throttled group
> wakes up.
>
> Can you pls explain the scenario when 2 is needed ?
>

As I recall this was for entity reweight.

In this case on_rq is cached and it's possible we'll tip over into a
throttled state when we update_curr(). I think with reweight_entity()
this may no longer be required.

In my current v5 stack I've actually reworked the throttling logic to make
update_curr() non-state-changing again, and thus eliminated the
associated checks and cases.

> Regards,
> Bharata.
>