2013-04-03 13:38:31

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On my SMP platform, which is made of 5 cores in 2 clusters, the
nr_busy_cpus field of the sched_group_power struct is not null when the
platform is fully idle. The root cause is that during the boot sequence,
some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting
for the other CPUs to boot. But the nr_busy_cpus field is initialized later,
with the assumption that all CPUs are in the busy state, whereas some CPUs
have already set their NOHZ_IDLE flag.
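
To make the race concrete, here is a toy user-space model of the sequence
above (plain C, for illustration only; the variable names mirror the
description and nothing below is taken from the scheduler sources):

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 2

int main(void)
{
	bool nohz_idle[NR_CPUS] = { false, false };
	int nr_busy_cpus;
	int cpu;

	/* boot: CPU1 reaches the idle loop early and sets its flag */
	nohz_idle[1] = true;

	/* later, the sched_group_power is initialized with the
	 * assumption that every CPU is busy */
	nr_busy_cpus = NR_CPUS;

	/* a CPU decrements the counter only on a busy->idle transition,
	 * i.e. only if its NOHZ_IDLE flag was not already set */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!nohz_idle[cpu]) {
			nohz_idle[cpu] = true;
			nr_busy_cpus--;
		}
	}

	/* every CPU is now idle, yet nr_busy_cpus is still 1 */
	printf("nr_busy_cpus = %d\n", nr_busy_cpus);
	return 0;
}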

More generally, the NOHZ_IDLE flag must be initialized when new sched_domains
are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned.

This condition can be ensured by adding a synchronize_rcu() between the
destruction of the old sched_domains and the creation of the new ones, so the
NOHZ_IDLE flag will not be updated through an old sched_domain once it has
been initialized. But this solution introduces additional latency in the
rebuild sequence that is called during cpu hotplug.

As suggested by Frederic Weisbecker, another solution is to have the same
RCU lifecycle for both the NOHZ_IDLE flag and the sched_domain struct. I have
introduced a new sched_domain_rq struct that is the entry point for both the
sched_domains and the objects that must follow the same lifecycle, like the
NOHZ_IDLE flag. They share the same RCU lifecycle and are always synchronized.

The synchronization is done at the cost of:
- an additional indirection for accessing the first sched_domain level
- an additional indirection and an rcu_dereference before accessing the
  NOHZ_IDLE flag (see the sketch below)
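
For illustration, the read side then looks roughly like the sketch below,
which simply mirrors the fair.c hunk further down; get_sched_domain_rq(),
sched_rq_flags() and for_each_domain_from_rq() are the helpers introduced
by this patch:

static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain_rq *sd_rq;
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	/* one rcu_dereference gives both the flag and the domain tree */
	sd_rq = get_sched_domain_rq(cpu);

	if (!sd_rq || !test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
		goto unlock;
	clear_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));

	/* the counters belong to the same domain tree as the flag */
	for_each_domain_from_rq(sd_rq, sd)
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}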

Change since v4:
- link both sched_domain and NOHZ_IDLE flag in one RCU object so
their states are always synchronized.

Change since V3:
- NOHZ flag is not cleared if a NULL domain is attached to the CPU
- Remove patch 2/2 which becomes useless with latest modifications

Change since V2:
- change the initialization to idle state instead of busy state so a CPU that
enters idle during the build of the sched_domain will not corrupt the
initialization state

Change since V1:
- remove the patch for SCHED softirq on an idle core use case as it was
a side effect of the other use cases.

Signed-off-by: Vincent Guittot <[email protected]>
---
include/linux/sched.h | 6 +++
kernel/sched/core.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched/fair.c | 35 +++++++++++------
kernel/sched/sched.h | 24 +++++++++--
4 files changed, 145 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6..2a52188 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -959,6 +959,12 @@ struct sched_domain {
unsigned long span[0];
};

+struct sched_domain_rq {
+ struct sched_domain *sd;
+ unsigned long flags;
+ struct rcu_head rcu; /* used during destruction */
+};
+
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
{
return to_cpumask(sd->span);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624..69e2313 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
destroy_sched_domain(sd, cpu);
}

+static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
+{
+ if (!sd_rq)
+ return;
+
+ destroy_sched_domains(sd_rq->sd, cpu);
+ kfree_rcu(sd_rq, rcu);
+}
+
/*
* Keep a special pointer to the highest sched_domain that has
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
@@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
* hold the hotplug lock.
*/
static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
+ int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
+ struct sched_domain_rq *tmp_rq;
+ struct sched_domain *tmp, *sd = NULL;
+
+ /*
+ * If we don't have any sched_domain and associated object, we can
+ * directly jump to the attach sequence otherwise we try to degenerate
+ * the sched_domain
+ */
+ if (!sd_rq)
+ goto attach;
+
+ /* Get a pointer to the 1st sched_domain */
+ sd = sd_rq->sd;

/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
+ /* update sched_domain_rq */
+ sd_rq->sd = sd;
}

+attach:
sched_domain_debug(sd, cpu);

rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
+ tmp_rq = rq->sd_rq;
+ rcu_assign_pointer(rq->sd_rq, sd_rq);
+ destroy_sched_domain_rq(tmp_rq, cpu);

update_top_cache_domain(cpu);
}
@@ -5695,12 +5720,14 @@ struct sd_data {
};

struct s_data {
+ struct sched_domain_rq ** __percpu sd_rq;
struct sched_domain ** __percpu sd;
struct root_domain *rd;
};

enum s_alloc {
sa_rootdomain,
+ sa_sd_rq,
sa_sd,
sa_sd_storage,
sa_none,
@@ -5935,7 +5962,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
return;

update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ atomic_set(&sg->sgp->nr_busy_cpus, 0);
}

int __weak arch_sd_sibling_asym_packing(void)
@@ -6011,6 +6038,8 @@ static void set_domain_attribute(struct sched_domain *sd,

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d);
+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
@@ -6019,6 +6048,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd_rq:
+ __sdrq_free(cpu_map, d); /* fall through */
+ free_percpu(d->sd_rq); /* fall through */
case sa_sd:
free_percpu(d->sd); /* fall through */
case sa_sd_storage:
@@ -6038,9 +6070,14 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
return sa_sd_storage;
+ d->sd_rq = alloc_percpu(struct sched_domain_rq *);
+ if (!d->sd_rq)
+ return sa_sd;
+ if (__sdrq_alloc(cpu_map, d))
+ return sa_sd_rq;
d->rd = alloc_rootdomain();
if (!d->rd)
- return sa_sd;
+ return sa_sd_rq;
return sa_rootdomain;
}

@@ -6466,6 +6503,46 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}

+static int __sdrq_alloc(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain_rq *sd_rq;
+
+ sd_rq = kzalloc_node(sizeof(struct sched_domain_rq),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd_rq)
+ return -ENOMEM;
+
+ *per_cpu_ptr(d->sd_rq, j) = sd_rq;
+ }
+
+ return 0;
+}
+
+static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
+{
+ int j;
+
+ for_each_cpu(j, cpu_map)
+ if (*per_cpu_ptr(d->sd_rq, j))
+ kfree(*per_cpu_ptr(d->sd_rq, j));
+}
+
+static void build_sched_domain_rq(struct s_data *d, int cpu)
+{
+ struct sched_domain_rq *sd_rq;
+ struct sched_domain *sd;
+
+ /* Attach sched_domain to sched_domain_rq */
+ sd = *per_cpu_ptr(d->sd, cpu);
+ sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
+ sd_rq->sd = sd;
+ /* Init flags */
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+}
+
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
struct s_data *d, const struct cpumask *cpu_map,
struct sched_domain_attr *attr, struct sched_domain *child,
@@ -6495,6 +6572,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
struct s_data d;
int i, ret = -ENOMEM;
@@ -6547,11 +6625,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}

+ /* Init objects that must follow the sched_domain lifecycle */
+ for_each_cpu(i, cpu_map) {
+ build_sched_domain_rq(&d, i);
+ }
+
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
+ sd_rq = *per_cpu_ptr(d.sd_rq, i);
+ cpu_attach_domain(sd_rq, d.rd, i);
+ /* claim allocation of sched_domain_rq object */
+ *per_cpu_ptr(d.sd_rq, i) = NULL;
}
rcu_read_unlock();

@@ -6982,7 +7067,7 @@ void __init sched_init(void)
rq->last_load_update_tick = jiffies;

#ifdef CONFIG_SMP
- rq->sd = NULL;
+ rq->sd_rq = NULL;
rq->rd = NULL;
rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..1c7447e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,31 +5392,39 @@ static inline void nohz_balance_exit_idle(int cpu)

static inline void set_cpu_sd_state_busy(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();

- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || !test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ clear_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}

void set_cpu_sd_state_idle(void)
{
+ struct sched_domain_rq *sd_rq;
struct sched_domain *sd;
int cpu = smp_processor_id();

- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd_rq = get_sched_domain_rq(cpu);
+
+ if (!sd_rq || test_bit(NOHZ_IDLE, sched_rq_flags(sd_rq)))
+ goto unlock;
+ set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
+
+ for_each_domain_from_rq(sd_rq, sd)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}

@@ -5673,7 +5681,12 @@ static void run_rebalance_domains(struct softirq_action *h)

static inline int on_null_domain(int cpu)
{
- return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+ struct sched_domain_rq *sd_rq =
+ rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
+ struct sched_domain *sd = NULL;
+ if (sd_rq)
+ sd = sd_rq->sd;
+ return !sd;
}

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..f589306 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -417,7 +417,7 @@ struct rq {

#ifdef CONFIG_SMP
struct root_domain *rd;
- struct sched_domain *sd;
+ struct sched_domain_rq *sd_rq;

unsigned long cpu_power;

@@ -505,21 +505,37 @@ DECLARE_PER_CPU(struct rq, runqueues);

#ifdef CONFIG_SMP

-#define rcu_dereference_check_sched_domain(p) \
+#define rcu_dereference_check_sched_domain_rq(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))

+#define get_sched_domain_rq(cpu) \
+ rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)
+
+#define rcu_dereference_check_sched_domain(cpu) ({ \
+ struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
+ struct sched_domain *__sd = NULL; \
+ if (__sd_rq) \
+ __sd = __sd_rq->sd; \
+ __sd; \
+})
+
+#define sched_rq_flags(sd_rq) (&sd_rq->flags)
+
/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * The domain tree (rq->sd_rq) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_check_sched_domain(cpu); \
__sd; __sd = __sd->parent)

+#define for_each_domain_from_rq(sd_rq, __sd) \
+ for (__sd = sd_rq->sd; __sd; __sd = __sd->parent)
+
#define for_each_lower_domain(sd) for (; sd; sd = sd->child)

/**
--
1.7.9.5


2013-04-04 14:24:27

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013/4/3 Vincent Guittot <[email protected]>:
> [...]
> @@ -959,6 +959,12 @@ struct sched_domain {
> unsigned long span[0];
> };
>
> +struct sched_domain_rq {
> + struct sched_domain *sd;

Why not make it part of the structure content instead of pointing to it?

> + unsigned long flags;
> + struct rcu_head rcu; /* used during destruction */
> +};

2013-04-04 14:31:36

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013/4/4 Frederic Weisbecker <[email protected]>:
> 2013/4/3 Vincent Guittot <[email protected]>:
>> [...]
>> @@ -959,6 +959,12 @@ struct sched_domain {
>> unsigned long span[0];
>> };
>>
>> +struct sched_domain_rq {
>> + struct sched_domain *sd;
>
> Why not make it part of the structure content instead of pointing to it?

I just realized that would make destroy_sched_domains() too complicated,
because only the leaf sched_domain belongs to a sched_domain_rq.
Never mind.

2013-04-04 17:07:46

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013/4/3 Vincent Guittot <[email protected]>:
> [...]
> @@ -5935,7 +5962,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
> return;
>
> update_group_power(sd, cpu);
> - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
> + atomic_set(&sg->sgp->nr_busy_cpus, 0);

Is it possible that we could be dealing here with a
sched_group/sched_group_power that is used on another CPU (from that
CPU's rq->sd_rq->sd) concurrently?
When we call build_sched_groups(), we might reuse an existing struct
sched_group used elsewhere, right? If so, is there a race with the
above initialization?

2013-04-04 17:30:47

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On 4 April 2013 19:07, Frederic Weisbecker <[email protected]> wrote:
> 2013/4/3 Vincent Guittot <[email protected]>:
>> [...]
>> @@ -5935,7 +5962,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
>> return;
>>
>> update_group_power(sd, cpu);
>> - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
>> + atomic_set(&sg->sgp->nr_busy_cpus, 0);
>
> Is it possible that we could be dealing here with a
> sched_group/sched_group_power that is used on another CPU (from that
> CPU's rq->sd_rq->sd) concurrently?
> When we call build_sched_groups(), we might reuse an existing struct
> sched_group used elsewhere, right? If so, is there a race with the
> above initialization?

No, we are not reusing an existing struct: the
sched_group/sched_group_power that is initialized here has just been
created by __visit_domain_allocation_hell() in build_sched_domains(). The
sched_group/sched_group_power is not yet attached to any CPU.

Vincent

2013-04-09 07:16:48

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On 4 April 2013 19:30, Vincent Guittot <[email protected]> wrote:
> On 4 April 2013 19:07, Frederic Weisbecker <[email protected]> wrote:
>> 2013/4/3 Vincent Guittot <[email protected]>:
>>> [...]
>>
>> Is it possible that we could be dealing here with a
>> sched_group/sched_group_power that is used on another CPU (from that
>> CPU's rq->sd_rq->sd) concurrently?
>> When we call build_sched_groups(), we might reuse an existing struct
>> sched_group used elsewhere, right? If so, is there a race with the
>> above initialization?
>
> No, we are not reusing an existing struct: the
> sched_group/sched_group_power that is initialized here has just been
> created by __visit_domain_allocation_hell() in build_sched_domains(). The
> sched_group/sched_group_power is not yet attached to any CPU.
>

Hi Frederic,

Does this answer your concern, and do you have any more comments about the patch?

Vincent

> Vincent

2013-04-09 12:45:41

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

2013/4/4 Vincent Guittot <[email protected]>:
> On 4 April 2013 19:07, Frederic Weisbecker <[email protected]> wrote:
>> Is it possible that we can be dealing here with a
>> sched_group/sched_group_power that is used on another CPU (from that
>> CPU's rq->rq_sd->sd) concurrently?
>> When we call build_sched_groups(), we might reuse an exisiting struct
>> sched_group used elsewhere right? If so, is there a race with the
>> above initialization?
>
> No we are not reusing an existing struct, the
> sched_group/sched_group_power that is initialized here, has just been
> created by __visit_domain_allocation_hell in build_sched_domains. The
> sched_group/sched_group_power is not already attached to any CPU

I see. Yeah, the group allocation/initialization is done per domain
found in doms_new[]. And there is no further reuse of these groups
once they are attached.

Looking at the code, it seems we can draw a symmetric conclusion for
group release? When we destroy a per-cpu domain hierarchy and then put
our references to the struct sched_group, all the other per-cpu
domains that reference these sched_groups are about to put their
references soon too, right? Because IIUC we always destroy these per-cpu
domains per highest-level partition (those found in doms_cur[])?

I'm just asking to make sure we don't need some
atomic_dec(nr_busy_cpus) on per-cpu domain release, since the sched_group
is not necessarily getting released soon.

Thanks for your patience :)

2013-04-10 15:23:40

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On 9 April 2013 14:45, Frederic Weisbecker <[email protected]> wrote:
> 2013/4/4 Vincent Guittot <[email protected]>:
>> On 4 April 2013 19:07, Frederic Weisbecker <[email protected]> wrote:
>>> Is it possible that we can be dealing here with a
>>> sched_group/sched_group_power that is used on another CPU (from that
>>> CPU's rq->rq_sd->sd) concurrently?
>>> When we call build_sched_groups(), we might reuse an exisiting struct
>>> sched_group used elsewhere right? If so, is there a race with the
>>> above initialization?
>>
>> No we are not reusing an existing struct, the
>> sched_group/sched_group_power that is initialized here, has just been
>> created by __visit_domain_allocation_hell in build_sched_domains. The
>> sched_group/sched_group_power is not already attached to any CPU
>
> I see. Yeah, the group allocation/initialization is done per domain
> found in doms_new[]. And there is no further reuse of these groups
> once they are attached.
>
> Looking at the code, it seems we can draw a symmetric conclusion for
> group release? When we destroy a per-cpu domain hierarchy and then put
> our references to the struct sched_group, all the other per-cpu
> domains that reference these sched_groups are about to put their
> references soon too, right? Because IIUC we always destroy these per-cpu
> domains per highest-level partition (those found in doms_cur[])?

Yes

>
> I'm just asking to make sure we don't need some
> atomic_dec(nr_busy_cpus) on per-cpu domain release, since the sched_group
> is not necessarily getting released soon.

yes, it's not needed

>
> Thanks for your patience :)

That's fine.

2013-04-11 14:56:29

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote:
> [...]
> Signed-off-by: Vincent Guittot <[email protected]>

Ok, the lockless scheme involving nr_busy_cpus and the rq flags seems to be correct now.
We can hope somebody will come up with a less complicated solution, but for now that's
the best fix I've seen.

I just have a few comments on details.

> ---
> include/linux/sched.h | 6 +++
> kernel/sched/core.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-----
> kernel/sched/fair.c | 35 +++++++++++------
> kernel/sched/sched.h | 24 +++++++++--
> 4 files changed, 145 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d35d2b6..2a52188 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -959,6 +959,12 @@ struct sched_domain {
> unsigned long span[0];
> };
>
> +struct sched_domain_rq {
> + struct sched_domain *sd;
> + unsigned long flags;
> + struct rcu_head rcu; /* used during destruction */
> +};

So the reason for this level of indirection won't be intuitive for those
who read that code. Please add some comments that explain why we need
it, i.e. because we need the object lifecycle of the sched_group_power
and the flags to be the same for the lockless scheme to work.
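
(For illustration only, a possible shape for such a comment; the wording
is just a sketch, not something from the patch:)

/*
 * sched_domain_rq ties the rq's first sched_domain level and the
 * NOHZ_IDLE flag into a single RCU-protected object, so that the flag
 * and the domain hierarchy (with its nr_busy_cpus counters) always
 * appear and disappear together. A reader holding rcu_read_lock()
 * therefore sees a consistent pair, and a newly built hierarchy always
 * starts from freshly initialized flags.
 */
struct sched_domain_rq {
	struct sched_domain *sd;
	unsigned long flags;
	struct rcu_head rcu;	/* used during destruction */
};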

> +
> static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
> {
> return to_cpumask(sd->span);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7f12624..69e2313 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5602,6 +5602,15 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
> destroy_sched_domain(sd, cpu);
> }
>
> +static void destroy_sched_domain_rq(struct sched_domain_rq *sd_rq, int cpu)
> +{
> + if (!sd_rq)
> + return;
> +
> + destroy_sched_domains(sd_rq->sd, cpu);
> + kfree_rcu(sd_rq, rcu);
> +}
> +
> /*
> * Keep a special pointer to the highest sched_domain that has
> * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
> * hold the hotplug lock.
> */
> static void
> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
> + int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> - struct sched_domain *tmp;
> + struct sched_domain_rq *tmp_rq;

old_sd_rq would be a clearer name.

> + struct sched_domain *tmp, *sd = NULL;
> +
> + /*
> + * If we don't have any sched_domain and associated object, we can
> + * directly jump to the attach sequence otherwise we try to degenerate
> + * the sched_domain
> + */
> + if (!sd_rq)
> + goto attach;
> +
> + /* Get a pointer to the 1st sched_domain */
> + sd = sd_rq->sd;
>
> /* Remove the sched domains which do not contribute to scheduling. */
> for (tmp = sd; tmp; ) {
> @@ -5658,14 +5680,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
> destroy_sched_domain(tmp, cpu);
> if (sd)
> sd->child = NULL;
> + /* update sched_domain_rq */
> + sd_rq->sd = sd;
> }
>
> +attach:
> sched_domain_debug(sd, cpu);
>
> rq_attach_root(rq, rd);
> - tmp = rq->sd;
> - rcu_assign_pointer(rq->sd, sd);
> - destroy_sched_domains(tmp, cpu);
> + tmp_rq = rq->sd_rq;
> + rcu_assign_pointer(rq->sd_rq, sd_rq);
> + destroy_sched_domain_rq(tmp_rq, cpu);
>
> update_top_cache_domain(cpu);

[...]
> +static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
> +{
> + int j;
> +
> + for_each_cpu(j, cpu_map)
> + if (*per_cpu_ptr(d->sd_rq, j))
> + kfree(*per_cpu_ptr(d->sd_rq, j));

kfree(NULL) works.

> +}
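
(A minimal sketch of the simplification being suggested, for illustration:)

static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
{
	int j;

	/* kfree() ignores NULL pointers, so no need to test first */
	for_each_cpu(j, cpu_map)
		kfree(*per_cpu_ptr(d->sd_rq, j));
}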
> +
> +static void build_sched_domain_rq(struct s_data *d, int cpu)
> +{
> + struct sched_domain_rq *sd_rq;
> + struct sched_domain *sd;
> +
> + /* Attach sched_domain to sched_domain_rq */
> + sd = *per_cpu_ptr(d->sd, cpu);
> + sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
> + sd_rq->sd = sd;
> + /* Init flags */
> + set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
> +}
> +
> struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
> struct s_data *d, const struct cpumask *cpu_map,
> struct sched_domain_attr *attr, struct sched_domain *child,
> @@ -6495,6 +6572,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
> struct sched_domain_attr *attr)
> {
> enum s_alloc alloc_state = sa_none;
> + struct sched_domain_rq *sd_rq;
> struct sched_domain *sd;
> struct s_data d;
> int i, ret = -ENOMEM;
> @@ -6547,11 +6625,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
> }
> }
>
> + /* Init objects that must follow the sched_domain lifecycle */
> + for_each_cpu(i, cpu_map) {
> + build_sched_domain_rq(&d, i);
> + }

I suggest you put the above domain_rq initialization before the domain
initialization.

So that we have a more intuitively ordered initialization:

* set up domains_rq
* set up domains
* set up sched groups
* set up sched groups power

> +
> /* Attach the domains */
> rcu_read_lock();
> for_each_cpu(i, cpu_map) {
> - sd = *per_cpu_ptr(d.sd, i);
> - cpu_attach_domain(sd, d.rd, i);
> + sd_rq = *per_cpu_ptr(d.sd_rq, i);
> + cpu_attach_domain(sd_rq, d.rd, i);
> + /* claim allocation of sched_domain_rq object */
> + *per_cpu_ptr(d.sd_rq, i) = NULL;
> }
> rcu_read_unlock();
[...]
> @@ -5673,7 +5681,12 @@ static void run_rebalance_domains(struct softirq_action *h)
>
> static inline int on_null_domain(int cpu)
> {
> - return !rcu_dereference_sched(cpu_rq(cpu)->sd);
> + struct sched_domain_rq *sd_rq =
> + rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
> + struct sched_domain *sd = NULL;
> + if (sd_rq)
> + sd = sd_rq->sd;

Is it possible to have sd_rq->sd == NULL ?

> + return !sd;
> }
>
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index cc03cfd..f589306 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -417,7 +417,7 @@ struct rq {
>
> #ifdef CONFIG_SMP
> struct root_domain *rd;
> - struct sched_domain *sd;
> + struct sched_domain_rq *sd_rq;
>
> unsigned long cpu_power;
>
> @@ -505,21 +505,37 @@ DECLARE_PER_CPU(struct rq, runqueues);
>
> #ifdef CONFIG_SMP
>
> -#define rcu_dereference_check_sched_domain(p) \
> +#define rcu_dereference_check_sched_domain_rq(p) \
> rcu_dereference_check((p), \
> lockdep_is_held(&sched_domains_mutex))
>
> +#define get_sched_domain_rq(cpu) \
> + rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)

How about rcu_dereference_domain_rq()? It seems important to me that
we keep the rcu_dereference_*() naming so that we don't hide what really
happens there behind a more opaque name.

I mean RCU is tricky to deal with and it's important not to obfuscate its use.
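
(That is, keeping the same definition but under an rcu_dereference_*()
name, something along these lines, as a sketch of the suggestion:)

#define rcu_dereference_domain_rq(cpu) \
	rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)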

> +
> +#define rcu_dereference_check_sched_domain(cpu) ({ \
> + struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
> + struct sched_domain *__sd = NULL; \
> + if (__sd_rq) \
> + __sd = __sd_rq->sd; \

Same question for the NULL sd.

> + __sd; \
> +})
> +
> +#define sched_rq_flags(sd_rq) (&sd_rq->flags)

Hm, why this "sched_" prefix? Maybe rq_domain_flags() ?

Thanks.

2013-04-12 08:23:54

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH Resend v5] sched: fix init NOHZ_IDLE flag

On 11 April 2013 16:56, Frederic Weisbecker <[email protected]> wrote:
> On Wed, Apr 03, 2013 at 03:37:43PM +0200, Vincent Guittot wrote:
>> [...]
>> Signed-off-by: Vincent Guittot <[email protected]>
>
> Ok, the lockless scheme involving nr_busy_cpus and the rq flags seems to be correct now.
> We can hope somebody will come up with a less complicated solution, but for now that's
> the best fix I've seen.

Thanks

>
> I just have a few comments on details.
>
>> ---
>> include/linux/sched.h | 6 +++
>> kernel/sched/core.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-----
>> kernel/sched/fair.c | 35 +++++++++++------
>> kernel/sched/sched.h | 24 +++++++++--
>> 4 files changed, 145 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d35d2b6..2a52188 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -959,6 +959,12 @@ struct sched_domain {
>> unsigned long span[0];
>> };
>>
>> +struct sched_domain_rq {
>> + struct sched_domain *sd;
>> + unsigned long flags;
>> + struct rcu_head rcu; /* used during destruction */
>> +};
>
> So the reason for this level of indirection won't be intuitive for those
> who read that code. Please add some comments that explain why we need
> it, i.e. because we need the object lifecycle of the sched_group_power
> and the flags to be the same for the lockless scheme to work.

OK, I will add a comment.

>
>> [...]
>> @@ -5632,10 +5641,23 @@ static void update_top_cache_domain(int cpu)
>> * hold the hotplug lock.
>> */
>> static void
>> -cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
>> +cpu_attach_domain(struct sched_domain_rq *sd_rq, struct root_domain *rd,
>> + int cpu)
>> {
>> struct rq *rq = cpu_rq(cpu);
>> - struct sched_domain *tmp;
>> + struct sched_domain_rq *tmp_rq;
>
> old_sd_rq would be a clearer name.

ok

>
> [...]
>> +static void __sdrq_free(const struct cpumask *cpu_map, struct s_data *d)
>> +{
>> + int j;
>> +
>> + for_each_cpu(j, cpu_map)
>> + if (*per_cpu_ptr(d->sd_rq, j))
>> + kfree(*per_cpu_ptr(d->sd_rq, j));
>
> kfree(NULL) works.

thanks

>
>> +}
>> +
>> +static void build_sched_domain_rq(struct s_data *d, int cpu)
>> +{
>> + struct sched_domain_rq *sd_rq;
>> + struct sched_domain *sd;
>> +
>> + /* Attach sched_domain to sched_domain_rq */
>> + sd = *per_cpu_ptr(d->sd, cpu);
>> + sd_rq = *per_cpu_ptr(d->sd_rq, cpu);
>> + sd_rq->sd = sd;
>> + /* Init flags */
>> + set_bit(NOHZ_IDLE, sched_rq_flags(sd_rq));
>> +}
>> +
>> struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
>> struct s_data *d, const struct cpumask *cpu_map,
>> struct sched_domain_attr *attr, struct sched_domain *child,
>> @@ -6495,6 +6572,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
>> struct sched_domain_attr *attr)
>> {
>> enum s_alloc alloc_state = sa_none;
>> + struct sched_domain_rq *sd_rq;
>> struct sched_domain *sd;
>> struct s_data d;
>> int i, ret = -ENOMEM;
>> @@ -6547,11 +6625,18 @@ static int build_sched_domains(const struct cpumask *cpu_map,
>> }
>> }
>>
>> + /* Init objects that must follow the sched_domain lifecycle */
>> + for_each_cpu(i, cpu_map) {
>> + build_sched_domain_rq(&d, i);
>> + }
>
> I suggest you put the above domain_rq initialization before the domain
> initialization.

I can't, because build_sched_domain() initializes the part of struct
s_data d that is then used by build_sched_domain_rq().

>
> So that we have a more intuitively ordered initialization:
>
> * set up domains_rq
> * set up domains
> * set up sched groups
> * set up sched groups power
>
>> +
>> /* Attach the domains */
>> rcu_read_lock();
>> for_each_cpu(i, cpu_map) {
>> - sd = *per_cpu_ptr(d.sd, i);
>> - cpu_attach_domain(sd, d.rd, i);
>> + sd_rq = *per_cpu_ptr(d.sd_rq, i);
>> + cpu_attach_domain(sd_rq, d.rd, i);
>> + /* claim allocation of sched_domain_rq object */
>> + *per_cpu_ptr(d.sd_rq, i) = NULL;
>> }
>> rcu_read_unlock();
> [...]
>> @@ -5673,7 +5681,12 @@ static void run_rebalance_domains(struct softirq_action *h)
>>
>> static inline int on_null_domain(int cpu)
>> {
>> - return !rcu_dereference_sched(cpu_rq(cpu)->sd);
>> + struct sched_domain_rq *sd_rq =
>> + rcu_dereference_sched(cpu_rq(cpu)->sd_rq);
>> + struct sched_domain *sd = NULL;
>> + if (sd_rq)
>> + sd = sd_rq->sd;
>
> Is it possible to have sd_rq->sd == NULL ?

I think so, with isolcpus as an example.

>
>> + return !sd;
>> }
>>
>> /*
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index cc03cfd..f589306 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -417,7 +417,7 @@ struct rq {
>>
>> #ifdef CONFIG_SMP
>> struct root_domain *rd;
>> - struct sched_domain *sd;
>> + struct sched_domain_rq *sd_rq;
>>
>> unsigned long cpu_power;
>>
>> @@ -505,21 +505,37 @@ DECLARE_PER_CPU(struct rq, runqueues);
>>
>> #ifdef CONFIG_SMP
>>
>> -#define rcu_dereference_check_sched_domain(p) \
>> +#define rcu_dereference_check_sched_domain_rq(p) \
>> rcu_dereference_check((p), \
>> lockdep_is_held(&sched_domains_mutex))
>>
>> +#define get_sched_domain_rq(cpu) \
>> + rcu_dereference_check_sched_domain_rq(cpu_rq(cpu)->sd_rq)
>
> How about rcu_dereference_domain_rq()? It seems important to me that
> we keep the rcu_dereference_*() naming so that we don't hide what really
> happens there behind a more opaque name.

you're right

>
> I mean RCU is tricky to deal with and it's important not to obfuscate its use.
>
>> +
>> +#define rcu_dereference_check_sched_domain(cpu) ({ \
>> + struct sched_domain_rq *__sd_rq = get_sched_domain_rq(cpu); \
>> + struct sched_domain *__sd = NULL; \
>> + if (__sd_rq) \
>> + __sd = __sd_rq->sd; \
>
> Same question for the NULL sd.
>
>> + __sd; \
>> +})
>> +
>> +#define sched_rq_flags(sd_rq) (&sd_rq->flags)
>
> Hm, why this "sched_" prefix? Maybe rq_domain_flags() ?

OK, I'm going to change the name.

>
> Thanks.