2010-08-28 22:37:59

by Peter Zijlstra

Subject: [RFC][PATCH 1/3] sched: Rewrite tg_shares_up

By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should significantly improve cpu-cgroup performance on SMP systems.
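
As a condensed sketch of the mechanism (not literal patch code; locking, the
rq-clock update and the !tg->se[cpu] check are omitted, see tg_shares_up()
and update_cfs_shares() in the diff for the real thing):

	/* per-cpu: fold this cfs_rq's load-avg delta into the group sum */
	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period + 1);
	load_avg -= cfs_rq->load_contribution;

	atomic_add(load_avg, &tg->load_weight);	/* the only global state */
	cfs_rq->load_contribution += load_avg;

	/* per-cpu: recompute this cpu's slice of tg->shares */
	update_cfs_shares(cfs_rq);

Each invocation only locks the rq it is updating; the cross-cpu state is the
single atomic group counter.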

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/sched.h | 2
kernel/sched.c | 173 ++++++++++++------------------------------------
kernel/sched_debug.c | 11 ++-
kernel/sched_fair.c | 153 ++++++++++++++++++++++++++----------------
kernel/sched_features.h | 2
kernel/sysctl.c | 17 ----
6 files changed, 147 insertions(+), 211 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1858,8 +1858,6 @@ static inline void wake_up_idle_cpu(int
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;

enum sched_tunable_scaling {
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
*/
unsigned long h_load;

- /*
- * this cpu's part of tg->shares
- */
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp;

- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
@@ -793,20 +791,6 @@ late_initcall(sched_init_debug);
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
* period over which we average the RT time consumption, measured
* in ms.
*
@@ -1351,6 +1335,12 @@ static inline void update_load_sub(struc
lw->inv_weight = 0;
}

+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1539,97 +1529,44 @@ static unsigned long cpu_avg_load_per_ta

#ifdef CONFIG_FAIR_GROUP_SCHED

-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);

/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
+ unsigned long load_avg;
+ struct cfs_rq *cfs_rq;
unsigned long flags;
- int i;
+ int cpu = (long)data;
+ struct rq *rq;

- if (!tg->se[0])
+ if (!tg->se[cpu])
return 0;

- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];

- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
+ raw_spin_lock_irqsave(&rq->lock, flags);

- if (!rq_weight)
- rq_weight = sum_weight;
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq);

- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;

- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;

- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq);

- local_irq_restore(flags);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);

return 0;
}
@@ -1648,7 +1585,7 @@ static int tg_load_down(struct task_grou
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}

@@ -1657,21 +1594,16 @@ static int tg_load_down(struct task_grou
return 0;
}

-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
{
- s64 elapsed;
- u64 now;
-
if (root_task_group_empty())
return;

- now = local_clock();
- elapsed = now - sd->last_update;
+ /*
+ * XXX: replace with an on-demand list
+ */

- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
+ walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
}

static void update_h_load(long cpu)
@@ -1681,7 +1613,7 @@ static void update_h_load(long cpu)

#else

-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
{
}

@@ -1806,15 +1738,6 @@ static void double_rq_unlock(struct rq *

#endif

-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
-}
-#endif
-
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@@ -5400,7 +5323,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}

@@ -7652,8 +7574,7 @@ static void init_tg_cfs_entry(struct tas
se->cfs_rq = parent->my_q;

se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, tg->shares);
se->parent = parent;
}
#endif
@@ -7746,10 +7667,6 @@ void __init sched_init(void)

#endif /* CONFIG_CGROUP_SCHED */

-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;

@@ -8317,8 +8234,7 @@ static void __set_se_shares(struct sched
if (on_rq)
dequeue_entity(cfs_rq, se, 0);

- se->load.weight = shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, shares);

if (on_rq)
enqueue_entity(cfs_rq, se, 0);
@@ -8375,7 +8291,6 @@ int sched_group_set_shares(struct task_g
/*
* force a rebalance
*/
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
set_se_shares(tg->se[i], shares);
}

Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -204,13 +204,18 @@ void print_cfs_rq(struct seq_file *m, in
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
+ SPLIT_NS(cfs_rq->load_avg));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
+ SPLIT_NS(cfs_rq->load_period));
+ SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
+ cfs_rq->load_contribution);
+ SEQ_printf(m, " .%-30s: %d\n", "load_tg",
+ atomic_read(&tg->load_weight));

SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL

return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}

static void
@@ -647,9 +645,84 @@ account_entity_dequeue(struct cfs_rq *cf
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}

+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+ u64 period = sched_avg_period();
+ u64 now = rq_of(cfs_rq)->clock;
+ u64 delta = now - cfs_rq->load_stamp;
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_period += delta;
+ cfs_rq->load_avg += delta * cfs_rq->load.weight;
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq)
+ account_entity_dequeue(cfs_rq, se);
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ unsigned long load_weight, load, shares;
+
+ if (!cfs_rq)
+ return;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+
+ load = cfs_rq->load.weight;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +844,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(group_cfs_rq(se));

if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -782,6 +857,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
}

static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +901,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st

if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(group_cfs_rq(se));

/*
* Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1134,9 @@ enqueue_task_fair(struct rq *rq, struct
flags = ENQUEUE_WAKEUP;
}

+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+
hrtick_update(rq);
}

@@ -1071,12 +1153,16 @@ static void dequeue_task_fair(struct rq
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}

+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+
hrtick_update(rq);
}

@@ -1143,51 +1229,20 @@ static void task_waking_fair(struct rq *
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];

if (!tg->parent)
return wl;

- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
long S, rw, s, a, b;
- long more_w;

- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
-
- S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ S = tg->shares;
+ s = se->load.weight;
+ rw = se->my_q->load.weight;

a = S*(rw + wl);
b = S*rw + s*wg;
@@ -1509,23 +1564,6 @@ select_task_rq_fair(struct rq *rq, struc
sd = tmp;
}

-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
- tmp = affine_sd;
-
- if (tmp) {
- raw_spin_unlock(&rq->lock);
- update_shares(tmp);
- raw_spin_lock(&rq->lock);
- }
- }
-#endif
-
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
@@ -2980,7 +3018,6 @@ static int load_balance(int this_cpu, st
schedstat_inc(sd, lb_count[idle]);

redo:
- update_shares(sd);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);

@@ -3115,8 +3152,6 @@ out_one_pinned:
else
ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}

@@ -3510,6 +3545,8 @@ static void rebalance_domains(int cpu, e
int update_next_balance = 0;
int need_serialize;

+ update_shares(cpu);
+
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
Index: linux-2.6/kernel/sched_features.h
===================================================================
--- linux-2.6.orig/kernel/sched_features.h
+++ linux-2.6/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)

/*
* Spin-wait on mutex acquisition when the mutex owner is running on
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -307,15 +307,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_wakeup_granularity_ns,
},
{
- .procname = "sched_shares_ratelimit",
- .data = &sysctl_sched_shares_ratelimit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_shares_ratelimit,
- .extra2 = &max_sched_shares_ratelimit,
- },
- {
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
@@ -325,14 +316,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
- .procname = "sched_shares_thresh",
- .data = &sysctl_sched_shares_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),


2010-08-30 17:29:29

by Srivatsa Vaddagiri

Subject: Re: [RFC][PATCH 1/3] sched: Rewrite tg_shares_up

On Sun, Aug 29, 2010 at 12:30:26AM +0200, Peter Zijlstra wrote:
> By tracking a per-cpu load-avg for each cfs_rq and folding it into a
> global task_group load on each tick we can rework tg_shares_up to be
> strictly per-cpu.

So tg->load_weight is supposed to represent more or less the current task load
across all cpus? I see only atomic_add() to it - which means it can only keep
growing or remain constant - IOW capturing the historical load ever since
the task group was started? I was expecting it to reduce as a group
goes idle, otherwise

> +static void update_cfs_shares(struct cfs_rq *cfs_rq)
> +{
> + struct task_group *tg;
> + struct sched_entity *se;
> + unsigned long load_weight, load, shares;
> +
> + if (!cfs_rq)
> + return;
> +
> + tg = cfs_rq->tg;
> + se = tg->se[cpu_of(rq_of(cfs_rq))];
> + if (!se)
> + return;
> +
> + load = cfs_rq->load.weight;
> +
> + load_weight = atomic_read(&tg->load_weight);
> + load_weight -= cfs_rq->load_contribution;
> + load_weight += load;
> +
> + shares = (tg->shares * load);
> + if (load_weight)
> + shares /= load_weight;

this seems incorrect? Even though we have corrected tg->load_weight to reflect
the current load on 'cfs_rq', tg->load_weight still captures historical load on
other cpus and hence could be a large number, making the division inaccurate?

Also I wonder how much of a hot spot tg->load_weight would become...

- vatsa

2010-08-30 17:53:18

by Peter Zijlstra

Subject: Re: [RFC][PATCH 1/3] sched: Rewrite tg_shares_up

On Mon, 2010-08-30 at 22:50 +0530, Srivatsa Vaddagiri wrote:
> On Sun, Aug 29, 2010 at 12:30:26AM +0200, Peter Zijlstra wrote:
> > By tracking a per-cpu load-avg for each cfs_rq and folding it into a
> > global task_group load on each tick we can rework tg_shares_up to be
> > strictly per-cpu.
>
> So tg->load_weight is supposed to represent more or less current task load
> across all cpus? I see only atomic_add() to it - which means it can only keep
> growing or remain constant - IOW capturing the historical load even since
> the task group was started? I was expecting it to reduce based on how a group
> goes idle, otherwise

It can add a negative value; I should probably have made the code below use
long instead of unsigned long.
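
A toy userspace illustration of that (made-up numbers, not kernel code): the
running sum stands in for tg->load_weight and shrinks again once the group
goes idle, precisely because the per-tick delta can be negative.

	#include <stdio.h>

	int main(void)
	{
		long sum = 0;		/* stands in for tg->load_weight */
		long contrib = 0;	/* stands in for cfs_rq->load_contribution */
		long avg[] = { 2048, 3072, 1024, 0 };	/* per-tick load_avg samples */

		for (int i = 0; i < 4; i++) {
			long delta = avg[i] - contrib;	/* negative when load drops */
			sum += delta;			/* atomic_add() in the patch */
			contrib += delta;
			printf("tick %d: delta=%ld sum=%ld\n", i, delta, sum);
		}
		return 0;		/* sum is back at 0 once the group is idle */
	}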

> > +static void update_cfs_shares(struct cfs_rq *cfs_rq)
> > +{
> > + struct task_group *tg;
> > + struct sched_entity *se;
> > + unsigned long load_weight, load, shares;
> > +
> > + if (!cfs_rq)
> > + return;
> > +
> > + tg = cfs_rq->tg;
> > + se = tg->se[cpu_of(rq_of(cfs_rq))];
> > + if (!se)
> > + return;
> > +
> > + load = cfs_rq->load.weight;
> > +
> > + load_weight = atomic_read(&tg->load_weight);
> > + load_weight -= cfs_rq->load_contribution;
> > + load_weight += load;
> > +
> > + shares = (tg->shares * load);
> > + if (load_weight)
> > + shares /= load_weight;
>
> this seem incorrect? Even though we have corrected tg->load_weight to reflect
> current load on 'cfs_rq', tg->load_weight still captures historical load on
> other cpus and hence could be a large #, making the division inaccurate?

Well, without history you can get terribly inaccurate too. Suppose the
workload sleeps for 50ms then runs for 50ms with n tasks, each on its
own cpu. Without history you'll end up being off by n*load, with history
you'll be floating about the avg, giving a much more stable result.
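
To put illustrative numbers on that (mine, not from the workload above): with
n = 8 tasks of NICE_0_LOAD = 1024 each, an instantaneous sample sees either 0
or 8192 depending on which half of the 50ms/50ms cycle the tick lands in, so
it can be off by the full n*load, whereas the decaying load_avg hovers around
the ~4096 average no matter when it is sampled.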

The (tg->shares * cfs_rq->load) / tg->load < MIN_SHARES issue is not
something we can easily fix; there's only so much you can do with fixed-point
math.
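
Concretely, with illustrative numbers: tg->shares = 1024, a cfs_rq holding one
nice-0 task (load = 1024), and a group-wide load_weight of 1024*1024 (roughly
a thousand such tasks on other cpus) gives shares = 1024*1024/(1024*1024) = 1,
below MIN_SHARES, so it gets floored to MIN_SHARES and the lightly loaded cpu
ends up slightly over-weighted; with even smaller contributions the division
rounds to 0 outright. Fixed-point math has no finer granularity to offer here.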

> Also I wonder how much of a hot spot tg->load_weight would become ..

We touch it once per tick, so it's pretty hot on large systems; then again,
the larger the system, the less likely it is that all cpus run tasks from the
same cgroup.