2011-02-16 03:20:56

by Paul Turner

Subject: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

With task entities participating in throttled sub-trees, it is possible for
task activation/de-activation not to produce root-visible changes to
rq->nr_running. This in turn leads to incorrect idle and weight-per-task
load-balance decisions.

To allow correct accounting, we move responsibility for updating rq->nr_running
into the respective sched classes. In the fair-group case this update is
hierarchical, tracking the number of active tasks rooted at each group entity.

Note: technically this issue also exists with the existing sched_rt
throttling; however, due to the nearly complete provisioning of system
resources for rt scheduling, it is much less common by default.

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Bharata B Rao <[email protected]>
---
kernel/sched.c | 9 ++++++---
kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched_rt.c | 5 ++++-
3 files changed, 52 insertions(+), 4 deletions(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -330,7 +330,7 @@ struct task_group root_task_group;
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running;
+ unsigned long nr_running, h_nr_tasks;

u64 exec_clock;
u64 min_vruntime;
@@ -1846,6 +1846,11 @@ static const struct sched_class rt_sched

#include "sched_stats.h"

+static void mod_nr_running(struct rq *rq, long delta)
+{
+ rq->nr_running += delta;
+}
+
static void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
@@ -1896,7 +1901,6 @@ static void activate_task(struct rq *rq,
rq->nr_uninterruptible--;

enqueue_task(rq, p, flags);
- inc_nr_running(rq);
}

/*
@@ -1908,7 +1912,6 @@ static void deactivate_task(struct rq *r
rq->nr_uninterruptible++;

dequeue_task(rq, p, flags);
- dec_nr_running(rq);
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -81,6 +81,8 @@ unsigned int normalized_sysctl_sched_wak

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

+static void account_hier_tasks(struct sched_entity *se, int delta);
+
/*
* The exponential sliding window over which load is averaged for shares
* distribution.
@@ -933,6 +935,40 @@ static inline void update_entity_shares_
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

+#ifdef CONFIG_CFS_BANDWIDTH
+/* maintain hierarchal task counts on group entities */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+ struct rq *rq = rq_of(cfs_rq_of(se));
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ /* a throttled entity cannot affect its parent hierarchy */
+ if (group_cfs_rq(se) && cfs_rq_throttled(group_cfs_rq(se)))
+ break;
+
+ /* we affect our queuing entity */
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_tasks += delta;
+ }
+
+ /* account for global nr_running delta to hierarchy change */
+ if (!se)
+ mod_nr_running(rq, delta);
+}
+#else
+/*
+ * In the absence of group throttling, all operations are guaranteed to be
+ * globally visible at the root rq level.
+ */
+static void account_hier_tasks(struct sched_entity *se, int delta)
+{
+ struct rq *rq = rq_of(cfs_rq_of(se));
+
+ mod_nr_running(rq, delta);
+}
+#endif
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
update_cfs_shares(cfs_rq);
}

+ account_hier_tasks(&p->se, 1);
hrtick_update(rq);
}

@@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
update_cfs_shares(cfs_rq);
}

+ account_hier_tasks(&p->se, -1);
hrtick_update(rq);
}

@@ -1488,6 +1526,8 @@ static u64 tg_request_cfs_quota(struct t
return delta;
}

+static void account_hier_tasks(struct sched_entity *se, int delta);
+
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct sched_entity *se;
@@ -1507,6 +1547,7 @@ static void throttle_cfs_rq(struct cfs_r
if (!se->on_rq)
goto out_throttled;

+ account_hier_tasks(se, -cfs_rq->h_nr_tasks);
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);

@@ -1541,6 +1582,7 @@ static void unthrottle_cfs_rq(struct cfs
cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;

cfs_rq->throttled = 0;
+ account_hier_tasks(se, cfs_rq->h_nr_tasks);
for_each_sched_entity(se) {
if (se->on_rq)
break;
Index: tip/kernel/sched_rt.c
===================================================================
--- tip.orig/kernel/sched_rt.c
+++ tip/kernel/sched_rt.c
@@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta

if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ inc_nr_running(rq);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
dequeue_rt_entity(rt_se);

dequeue_pushable_task(rq, p);
+
+ dec_nr_running(rq);
}

/*
@@ -1783,4 +1787,3 @@ static void print_rt_stats(struct seq_fi
rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
-


2011-02-22 03:18:07

by Balbir Singh

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

* Paul Turner <[email protected]> [2011-02-15 19:18:37]:

> +#ifdef CONFIG_CFS_BANDWIDTH
> +/* maintain hierarchal task counts on group entities */
> +static void account_hier_tasks(struct sched_entity *se, int delta)

I don't like the use of hier, I'd expand it to hierarchical

I am not too sure about the RT bits, but other than that


Acked-by: Balbir Singh <[email protected]>


--
Three Cheers,
Balbir

2011-02-23 02:03:27

by Hidetoshi Seto

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

(2011/02/16 12:18), Paul Turner wrote:
> @@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
> update_cfs_shares(cfs_rq);
> }
>
> + account_hier_tasks(&p->se, 1);
> hrtick_update(rq);
> }
>
> @@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
> update_cfs_shares(cfs_rq);
> }
>
> + account_hier_tasks(&p->se, -1);
> hrtick_update(rq);
> }
>

Why does hrtick_update() need to be delayed until after modifying nr_running?
You should not impact the current hrtick logic without mentioning it.

> Index: tip/kernel/sched_rt.c
> ===================================================================
> --- tip.orig/kernel/sched_rt.c
> +++ tip/kernel/sched_rt.c
> @@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
>
> if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
> enqueue_pushable_task(rq, p);
> +
> + inc_nr_running(rq);
> }
>
> static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
> @@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
> dequeue_rt_entity(rt_se);
>
> dequeue_pushable_task(rq, p);
> +
> + dec_nr_running(rq);
> }
>
> /*

I think a similar change is required for sched_stoptask.c.

In fact, I could not boot tip/master with this v4 patchset.
It reports RCU stall warnings unless the following tweak is applied:

--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -35,11 +35,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
+ inc_nr_running(rq);
}

static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
+ dec_nr_running(rq);
}


Thanks,
H.Seto

2011-02-23 02:20:40

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

On Tue, Feb 22, 2011 at 6:02 PM, Hidetoshi Seto
<[email protected]> wrote:
> (2011/02/16 12:18), Paul Turner wrote:
>> @@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
>> update_cfs_shares(cfs_rq);
>> }
>>
>> + account_hier_tasks(&p->se, 1);
>> hrtick_update(rq);
>> }
>>
>> @@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
>> update_cfs_shares(cfs_rq);
>> }
>>
>> + account_hier_tasks(&p->se, -1);
>> hrtick_update(rq);
>> }
>>
>
> Why does hrtick_update() need to be delayed until after modifying nr_running?
> You should not impact the current hrtick logic without mentioning it.

There shouldn't be any impact -- hrtick_update only cares about
cfs_rq->nr_running, which is independent of rq->nr_running (maintained
by the hierarchical accounting).
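
For reference, the check in question looks roughly like this (a simplified
sketch of the hrtick_update() in sched_fair.c of that era, not necessarily
the exact code in tip); note that it consults the cfs_rq counter rather than
rq->nr_running:

	static void hrtick_update(struct rq *rq)
	{
		struct task_struct *curr = rq->curr;

		/* only relevant while a fair task is running */
		if (curr->sched_class != &fair_sched_class)
			return;

		/* decision is based on cfs_rq->nr_running, not rq->nr_running */
		if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
			hrtick_start_fair(rq, curr);
	}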

>
>> Index: tip/kernel/sched_rt.c
>> ===================================================================
>> --- tip.orig/kernel/sched_rt.c
>> +++ tip/kernel/sched_rt.c
>> @@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
>>
>> if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
>> enqueue_pushable_task(rq, p);
>> +
>> + inc_nr_running(rq);
>> }
>>
>> static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>> @@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
>> dequeue_rt_entity(rt_se);
>>
>> dequeue_pushable_task(rq, p);
>> +
>> + dec_nr_running(rq);
>> }
>>
>> /*
>
> I think a similar change is required for sched_stoptask.c.

Aha!

Yes, I missed this addition when re-basing.

It is needed since, when something is placed into the stop class via
set_sched, it goes through an activate/deactivate.

Will add, thanks!
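
For context, the class-switch path (e.g. sched_setscheduler()) roughly does
the following in that era's kernel/sched.c (a simplified sketch, not the exact
code); with this patch, activate_task()/deactivate_task() no longer adjust
rq->nr_running, so every class's enqueue/dequeue hooks, including the stop
class's, have to do it themselves:

	/* simplified sketch of __sched_setscheduler() switching a task's class */
	on_rq = p->se.on_rq;
	running = task_current(rq, p);
	if (on_rq)
		deactivate_task(rq, p, 0);	/* dequeue through the old class */
	if (running)
		p->sched_class->put_prev_task(rq, p);

	__setscheduler(rq, p, policy, param->sched_priority);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		activate_task(rq, p, 0);	/* enqueue through the new class */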

>
> In fact, I could not boot tip/master with this v4 patchset.
> It reports RCU stall warnings unless the following tweak is applied:
>
> --- a/kernel/sched_stoptask.c
> +++ b/kernel/sched_stoptask.c
> @@ -35,11 +35,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq
> static void
> enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
> {
> + inc_nr_running(rq);
> }
>
> static void
> dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
> {
> + dec_nr_running(rq);
> }
>
>
> Thanks,
> H.Seto
>
>

2011-02-23 02:43:17

by Balbir Singh

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

* Hidetoshi Seto <[email protected]> [2011-02-23 11:02:40]:

> (2011/02/16 12:18), Paul Turner wrote:
> > @@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
> > update_cfs_shares(cfs_rq);
> > }
> >
> > + account_hier_tasks(&p->se, 1);
> > hrtick_update(rq);
> > }
> >
> > @@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
> > update_cfs_shares(cfs_rq);
> > }
> >
> > + account_hier_tasks(&p->se, -1);
> > hrtick_update(rq);
> > }
> >
>
> Why does hrtick_update() need to be delayed until after modifying nr_running?
> You should not impact the current hrtick logic without mentioning it.
>
> > Index: tip/kernel/sched_rt.c
> > ===================================================================
> > --- tip.orig/kernel/sched_rt.c
> > +++ tip/kernel/sched_rt.c
> > @@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
> >
> > if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
> > enqueue_pushable_task(rq, p);
> > +
> > + inc_nr_running(rq);
> > }
> >
> > static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
> > @@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
> > dequeue_rt_entity(rt_se);
> >
> > dequeue_pushable_task(rq, p);
> > +
> > + dec_nr_running(rq);
> > }
> >
> > /*
>
> I think a similar change is required for sched_stoptask.c.
>
> In fact, I could not boot tip/master with this v4 patchset.
> It reports RCU stall warnings unless the following tweak is applied:
>
> --- a/kernel/sched_stoptask.c
> +++ b/kernel/sched_stoptask.c
> @@ -35,11 +35,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq
> static void
> enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
> {
> + inc_nr_running(rq);
> }
>
> static void
> dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
> {
> + dec_nr_running(rq);
> }
>

Good catch!

Acked-by: Balbir Singh <[email protected]>


--
Three Cheers,
Balbir

2011-02-23 08:05:37

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

On Mon, Feb 21, 2011 at 7:17 PM, Balbir Singh <[email protected]> wrote:
> * Paul Turner <[email protected]> [2011-02-15 19:18:37]:
>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +/* maintain hierarchal task counts on group entities */
>> +static void account_hier_tasks(struct sched_entity *se, int delta)
>
> I don't like the use of hier, I'd expand it to hierarchical
>

Sure

> I am not too sure about the RT bits, but other than that
>
>
> Acked-by: Balbir Singh <[email protected]>
>
>
> --
> Three Cheers,
> Balbir
>

2011-02-23 13:32:53

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:

> @@ -1846,6 +1846,11 @@ static const struct sched_class rt_sched
>
> #include "sched_stats.h"
>
> +static void mod_nr_running(struct rq *rq, long delta)
> +{
> + rq->nr_running += delta;
> +}

I personally don't see much use in such trivial wrappers.. if you're
going to rework all the nr_running stuff you might as well remove all of
that.

> Index: tip/kernel/sched_fair.c
> ===================================================================
> --- tip.orig/kernel/sched_fair.c
> +++ tip/kernel/sched_fair.c
> @@ -81,6 +81,8 @@ unsigned int normalized_sysctl_sched_wak
>
> const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
>
> +static void account_hier_tasks(struct sched_entity *se, int delta);
> +
> /*
> * The exponential sliding window over which load is averaged for shares
> * distribution.
> @@ -933,6 +935,40 @@ static inline void update_entity_shares_
> }
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_CFS_BANDWIDTH
> +/* maintain hierarchal task counts on group entities */
> +static void account_hier_tasks(struct sched_entity *se, int delta)
> +{
> + struct rq *rq = rq_of(cfs_rq_of(se));
> + struct cfs_rq *cfs_rq;
> +
> + for_each_sched_entity(se) {
> + /* a throttled entity cannot affect its parent hierarchy */
> + if (group_cfs_rq(se) && cfs_rq_throttled(group_cfs_rq(se)))
> + break;
> +
> + /* we affect our queuing entity */
> + cfs_rq = cfs_rq_of(se);
> + cfs_rq->h_nr_tasks += delta;
> + }
> +
> + /* account for global nr_running delta to hierarchy change */
> + if (!se)
> + mod_nr_running(rq, delta);
> +}
> +#else
> +/*
> + * In the absence of group throttling, all operations are guaranteed to be
> + * globally visible at the root rq level.
> + */
> +static void account_hier_tasks(struct sched_entity *se, int delta)
> +{
> + struct rq *rq = rq_of(cfs_rq_of(se));
> +
> + mod_nr_running(rq, delta);
> +}
> +#endif

While Balbir suggested expanding the _hier_ thing, I'd suggest simply
dropping it altogether, way too much typing ;-), but that is only if you
cannot get rid of the extra hierarchy iteration; see below.

> +
> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> #ifdef CONFIG_SCHEDSTATS
> @@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
> update_cfs_shares(cfs_rq);
> }
>
> + account_hier_tasks(&p->se, 1);
> hrtick_update(rq);
> }
>
> @@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
> update_cfs_shares(cfs_rq);
> }
>
> + account_hier_tasks(&p->se, -1);
> hrtick_update(rq);
> }
>
> @@ -1488,6 +1526,8 @@ static u64 tg_request_cfs_quota(struct t
> return delta;
> }
>
> +static void account_hier_tasks(struct sched_entity *se, int delta);
> +
> static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct sched_entity *se;
> @@ -1507,6 +1547,7 @@ static void throttle_cfs_rq(struct cfs_r
> if (!se->on_rq)
> goto out_throttled;
>
> + account_hier_tasks(se, -cfs_rq->h_nr_tasks);
> for_each_sched_entity(se) {
> struct cfs_rq *cfs_rq = cfs_rq_of(se);
>
> @@ -1541,6 +1582,7 @@ static void unthrottle_cfs_rq(struct cfs
> cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;
>
> cfs_rq->throttled = 0;
> + account_hier_tasks(se, cfs_rq->h_nr_tasks);
> for_each_sched_entity(se) {
> if (se->on_rq)
> break;

All call-sites are right next to a for_each_sched_entity() iteration, is
there really no way to fold those loops?

> Index: tip/kernel/sched_rt.c
> ===================================================================
> --- tip.orig/kernel/sched_rt.c
> +++ tip/kernel/sched_rt.c
> @@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
>
> if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
> enqueue_pushable_task(rq, p);
> +
> + inc_nr_running(rq);
> }
>
> static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
> @@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
> dequeue_rt_entity(rt_se);
>
> dequeue_pushable_task(rq, p);
> +
> + dec_nr_running(rq);
> }
>
> /*
> @@ -1783,4 +1787,3 @@ static void print_rt_stats(struct seq_fi
> rcu_read_unlock();
> }
> #endif /* CONFIG_SCHED_DEBUG */


You mentioned something about -rt having the same problem, yet you don't
fix it.. tskkk :-)

2011-02-25 03:25:36

by Paul Turner

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

On Wed, Feb 23, 2011 at 5:32 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, 2011-02-15 at 19:18 -0800, Paul Turner wrote:
>
>> @@ -1846,6 +1846,11 @@ static const struct sched_class rt_sched
>>
>> #include "sched_stats.h"
>>
>> +static void mod_nr_running(struct rq *rq, long delta)
>> +{
>> + rq->nr_running += delta;
>> +}
>
> I personally don't see much use in such trivial wrappers.. if you're
> going to rework all the nr_running stuff you might as well remove all of
> that.

I'm OK with that; I was trying to preserve some of the existing
encapsulation, but there isn't a need for it.

Will do.
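
A minimal sketch of what dropping the wrapper could look like at the
accounting sites (illustrative only; the final shape depends on how the
nr_running helpers end up being reworked):

	/* open-coded, without the mod_nr_running() helper */
	if (!se)
		rq->nr_running += delta;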

>
>> Index: tip/kernel/sched_fair.c
>> ===================================================================
>> --- tip.orig/kernel/sched_fair.c
>> +++ tip/kernel/sched_fair.c
>> @@ -81,6 +81,8 @@ unsigned int normalized_sysctl_sched_wak
>>
>> const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
>>
>> +static void account_hier_tasks(struct sched_entity *se, int delta);
>> +
>> /*
>> * The exponential sliding window over which load is averaged for shares
>> * distribution.
>> @@ -933,6 +935,40 @@ static inline void update_entity_shares_
>> }
>> #endif /* CONFIG_FAIR_GROUP_SCHED */
>>
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> +/* maintain hierarchal task counts on group entities */
>> +static void account_hier_tasks(struct sched_entity *se, int delta)
>> +{
>> + struct rq *rq = rq_of(cfs_rq_of(se));
>> + struct cfs_rq *cfs_rq;
>> +
>> + for_each_sched_entity(se) {
>> + /* a throttled entity cannot affect its parent hierarchy */
>> + if (group_cfs_rq(se) && cfs_rq_throttled(group_cfs_rq(se)))
>> + break;
>> +
>> + /* we affect our queuing entity */
>> + cfs_rq = cfs_rq_of(se);
>> + cfs_rq->h_nr_tasks += delta;
>> + }
>> +
>> + /* account for global nr_running delta to hierarchy change */
>> + if (!se)
>> + mod_nr_running(rq, delta);
>> +}
>> +#else
>> +/*
>> + * In the absence of group throttling, all operations are guaranteed to be
>> + * globally visible at the root rq level.
>> + */
>> +static void account_hier_tasks(struct sched_entity *se, int delta)
>> +{
>> + struct rq *rq = rq_of(cfs_rq_of(se));
>> +
>> + mod_nr_running(rq, delta);
>> +}
>> +#endif
>
> While Balbir suggested expanding the _hier_ thing, I'd suggest simply
> dropping it altogether, way too much typing ;-), but that is only if you
> cannot get rid of the extra hierarchy iteration; see below.
>
>> +
>> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>> {
>> #ifdef CONFIG_SCHEDSTATS
>> @@ -1428,6 +1464,7 @@ enqueue_task_fair(struct rq *rq, struct
>> update_cfs_shares(cfs_rq);
>> }
>>
>> + account_hier_tasks(&p->se, 1);
>> hrtick_update(rq);
>> }
>>
>> @@ -1461,6 +1498,7 @@ static void dequeue_task_fair(struct rq
>> update_cfs_shares(cfs_rq);
>> }
>>
>> + account_hier_tasks(&p->se, -1);
>> hrtick_update(rq);
>> }
>>
>> @@ -1488,6 +1526,8 @@ static u64 tg_request_cfs_quota(struct t
>> return delta;
>> }
>>
>> +static void account_hier_tasks(struct sched_entity *se, int delta);
>> +
>> static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> {
>> struct sched_entity *se;
>> @@ -1507,6 +1547,7 @@ static void throttle_cfs_r
>> if (!se->on_rq)
>> goto out_throttled;
>>
>> + account_hier_tasks(se, -cfs_rq->h_nr_tasks);
>> for_each_sched_entity(se) {
>> struct cfs_rq *cfs_rq = cfs_rq_of(se);
>>
>> @@ -1541,6 +1582,7 @@ static void unthrottle_cfs_rq(struct cfs
>> cfs_rq->load_stamp = cfs_rq->load_last = rq->clock_task;
>>
>> cfs_rq->throttled = 0;
>> + account_hier_tasks(se, cfs_rq->h_nr_tasks);
>> for_each_sched_entity(se) {
>> if (se->on_rq)
>> break;
>
> All call-sites are right next to a for_each_sched_entity() iteration, is
> there really no way to fold those loops?
>

I was trying to avoid some duplication, since many of those for_each
iterations are partial and I didn't want to dump the same loop for the
remnants everywhere.

However, thinking about it, I think this can be cleaned up and refined
by having account_hier iterate from wherever they finish, avoiding that
nastiness.
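
A rough sketch of what that fold could look like for the enqueue path
(illustrative only: the names mirror this patch, the throttling checks are
left out, and the real code would have to handle the partially-walked
hierarchy the same way the existing loops do):

	static void
	enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
	{
		struct cfs_rq *cfs_rq;
		struct sched_entity *se = &p->se;

		for_each_sched_entity(se) {
			if (se->on_rq)
				break;
			cfs_rq = cfs_rq_of(se);
			enqueue_entity(cfs_rq, se, flags);
			cfs_rq->h_nr_tasks++;	/* account while walking up */
			flags = ENQUEUE_WAKEUP;
		}

		for_each_sched_entity(se) {
			cfs_rq = cfs_rq_of(se);
			cfs_rq->h_nr_tasks++;	/* already-queued ancestors gain a task too */
			update_cfs_load(cfs_rq, 0);
			update_cfs_shares(cfs_rq);
		}

		rq->nr_running++;	/* root-visible change; throttling handling omitted in this sketch */
		hrtick_update(rq);
	}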


>> Index: tip/kernel/sched_rt.c
>> ===================================================================
>> --- tip.orig/kernel/sched_rt.c
>> +++ tip/kernel/sched_rt.c
>> @@ -906,6 +906,8 @@ enqueue_task_rt(struct rq *rq, struct ta
>>
>> if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
>> enqueue_pushable_task(rq, p);
>> +
>> + inc_nr_running(rq);
>> }
>>
>> static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>> @@ -916,6 +918,8 @@ static void dequeue_task_rt(struct rq *r
>> dequeue_rt_entity(rt_se);
>>
>> dequeue_pushable_task(rq, p);
>> +
>> + dec_nr_running(rq);
>> }
>>
>> /*
>> @@ -1783,4 +1787,3 @@ static void print_rt_stats(struct seq_fi
>> rcu_read_unlock();
>> }
>> #endif /* CONFIG_SCHED_DEBUG */
>
>
> You mentioned something about -rt having the same problem, yet you don't
> fix it.. tskkk :-)
>

Yeah, I agree :) -- I was planning on fixing this in a follow-up
[honest!], as it's not as important since it doesn't affect idle
balancing to the same degree. I wanted to first air the notion of
hierarchical task accounting in one of the schedulers, then mirror the
support across the other schedulers once consensus is reached.

I can make that part of this series if you'd prefer.



>

2011-02-25 12:17:59

by Peter Zijlstra

Subject: Re: [CFS Bandwidth Control v4 6/7] sched: hierarchical task accounting for SCHED_OTHER

On Thu, 2011-02-24 at 19:25 -0800, Paul Turner wrote:
>
> Yeah, I agree :) -- I was planning on fixing this in a follow-up
> [honest!], as it's not as important since it doesn't affect idle
> balancing to the same degree. I wanted to first air the notion of
> hierarchical task accounting in one of the schedulers, then mirror the
> support across the other schedulers once consensus is reached.

Right, I think I once argued against this, but the effect on a lot of
things is against me, so let's do this.