2006-03-25 08:28:32

by Srivatsa Vaddagiri

Subject: [PATCH 2.6.16-mm1 2/2] sched_domains: Allocate sched_groups dynamically

I hit a kernel lockup problem while making a CPUset exclusive. This was
also reported before:

http://lkml.org/lkml/2006/3/16/128 (Ref 1)

Dinakar (CCed here) said that it may be similar to the problem reported
here:

http://lkml.org/lkml/2005/8/20/40 (Ref 2)

Upon code inspection, I found that it is indeed a similar problem for the
SCHED_SMT case. The problem is recreated if SCHED_SMT is enabled in the
kernel and one tries to make an exclusive CPUset with just one thread in
it. This causes the two threads of the same CPU to be in different
dynamic sched domains, although they share the same sched_group
structure at the 'phys_domains' level. This leads to the same kind of
corruption described in Ref 2 above.
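
To illustrate the corruption mode outside the kernel, here is a small
userspace model in plain C (not kernel code; the names only mimic the
scheduler's, and build_ring() is a crude stand-in for
init_sched_build_groups()). Two domain builds share one static group
array, so the second build rewrites ->next links the first still walks:

#include <stdio.h>

#define NR_CPUS 4

struct sched_group {
	struct sched_group *next;	/* circular list, as in the kernel */
	int cpu;
};

/* the shared static array both "partitions" link their rings through */
static struct sched_group sched_group_phys[NR_CPUS];

/* link the given cpus into one circular ring inside the shared array */
static void build_ring(const int *cpus, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		sched_group_phys[cpus[i]].cpu = cpus[i];
		sched_group_phys[cpus[i]].next =
			&sched_group_phys[cpus[(i + 1) % n]];
	}
}

int main(void)
{
	int all[] = { 0, 1, 2, 3 };	/* initial full-system build */
	int part[] = { 2, 3 };		/* exclusive-cpuset rebuild */
	struct sched_group *sg, *start;
	int hops = 0;

	build_ring(all, 4);
	build_ring(part, 2);	/* silently rewrites links of groups 2, 3 */

	/* a domain built against the first ring now walks a list that
	 * no longer returns to its starting group */
	sg = start = &sched_group_phys[0];
	do {
		printf("cpu%d ", sg->cpu);
		sg = sg->next;
	} while (sg != start && ++hops < 16);
	printf("\n");

	if (sg != start)
		printf("group ring corrupted: never got back to start\n");
	return 0;
}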

Patch below, against 2.6.16-mm1, fixes the problem for both SCHED_SMT and
SCHED_MC cases.

I have tested the patch on an 8-way (with HT) Intel Xeon machine and
found that the lockups I was facing earlier went away.

I couldn't test this on a multi-core machine, since I don't think we
have one in our lab.

Suresh, would you mind testing the patch on a multi-core machine, if you
have access to one?

Basically you would need to create an exclusive CPUset with one CPU in it
(ensuring that its sibling in the same core is not part of the same
CPUset). As soon as you make the CPUset exclusive, you should hit some
kind of hang. With this patch, the hang should go away.
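
For reference, the steps can be scripted; here is a minimal reproducer
sketch in C. It assumes the cpuset filesystem is already mounted at
/dev/cpuset (mount -t cpuset none /dev/cpuset) and that CPU 1 is the HT
sibling of CPU 0 (both assumptions; adjust for your topology):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	mkdir("/dev/cpuset/one", 0755);

	/* exactly one CPU: its HT sibling stays outside the cpuset */
	write_str("/dev/cpuset/one/cpus", "1");
	write_str("/dev/cpuset/one/mems", "0");

	/* making the cpuset exclusive triggers the sched-domain
	 * rebuild that hangs an unpatched SCHED_SMT kernel */
	write_str("/dev/cpuset/one/cpu_exclusive", "1");

	return 0;
}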

Signed-off-by: Srivatsa Vaddagiri <[email protected]>

kernel/sched.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 58 insertions(+), 9 deletions(-)

diff -puN kernel/sched.c~sd_dynschedgroups kernel/sched.c
--- linux-2.6.16-mm1/kernel/sched.c~sd_dynschedgroups 2006-03-25 11:45:05.000000000 +0530
+++ linux-2.6.16-mm1-root/kernel/sched.c 2006-03-25 13:30:01.000000000 +0530
@@ -5884,7 +5884,7 @@ static int cpu_to_cpu_group(int cpu)

#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5900,7 +5900,7 @@ static int cpu_to_core_group(int cpu)
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
#if defined(CONFIG_SCHED_MC)
@@ -5963,7 +5963,12 @@ next_sg:
*/
void build_sched_domains(const cpumask_t *cpu_map)
{
- int i;
+ int i, alloc_phys_failed = 0;
+ struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+ int alloc_core_failed = 0;
+ struct sched_group *sched_group_core = NULL;
+#endif
#ifdef CONFIG_NUMA
int alloc_failed = 0;
struct sched_group **sched_group_nodes = NULL;
@@ -6026,15 +6031,41 @@ void build_sched_domains(const cpumask_t
cpus_and(sd->span, sd->span, *cpu_map);
#endif

+ if (!sched_group_phys && !alloc_phys_failed) {
+ sched_group_phys
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_phys) {
+ printk (KERN_WARNING
+ "Can not alloc phys sched group\n");
+ alloc_phys_failed = 1;
+ }
+ sched_group_phys_bycpu[i] = sched_group_phys;
+ }
+
p = sd;
sd = &per_cpu(phys_domains, i);
group = cpu_to_phys_group(i);
*sd = SD_CPU_INIT;
sd->span = nodemask;
sd->parent = p;
- sd->groups = &sched_group_phys[group];
+ sd->groups = sched_group_phys ? &sched_group_phys[group] : NULL;
+ if (!sd->groups)
+ sd->flags = 0; /* No load balancing */

#ifdef CONFIG_SCHED_MC
+ if (!sched_group_core && !alloc_core_failed) {
+ sched_group_core
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_core) {
+ printk (KERN_WARNING
+ "Can not alloc core sched group\n");
+ alloc_core_failed = 1;
+ }
+ sched_group_core_bycpu[i] = sched_group_core;
+ }
+
p = sd;
sd = &per_cpu(core_domains, i);
group = cpu_to_core_group(i);
@@ -6042,7 +6073,9 @@ void build_sched_domains(const cpumask_t
sd->span = cpu_coregroup_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
- sd->groups = &sched_group_core[group];
+ sd->groups = sched_group_core ? &sched_group_core[group] : NULL;
+ if (!sd->groups)
+ sd->flags = 0; /* No load balancing */
#endif

#ifdef CONFIG_SCHED_SMT
@@ -6077,8 +6110,9 @@ void build_sched_domains(const cpumask_t
cpus_and(this_core_map, this_core_map, *cpu_map);
if (i != first_cpu(this_core_map))
continue;
- init_sched_build_groups(sched_group_core, this_core_map,
- &cpu_to_core_group);
+ if (sched_group_core)
+ init_sched_build_groups(sched_group_core, this_core_map,
+ &cpu_to_core_group);
}
#endif

@@ -6091,7 +6125,8 @@ void build_sched_domains(const cpumask_t
if (cpus_empty(nodemask))
continue;

- init_sched_build_groups(sched_group_phys, nodemask,
+ if (sched_group_phys)
+ init_sched_build_groups(sched_group_phys, nodemask,
&cpu_to_phys_group);
}

@@ -6249,9 +6284,9 @@ static void arch_init_sched_domains(cons

static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
+ int cpu;
#ifdef CONFIG_NUMA
int i;
- int cpu;

for_each_cpu_mask(cpu, *cpu_map) {
struct sched_group *sched_group_allnodes
@@ -6289,6 +6324,20 @@ next_sg:
sched_group_nodes_bycpu[cpu] = NULL;
}
#endif
+ for_each_cpu_mask(cpu, *cpu_map) {
+ if (sched_group_phys_bycpu[cpu]) {
+ kfree(sched_group_phys_bycpu[cpu]);
+ sched_group_phys_bycpu[cpu] = NULL;
+ }
+ }
+#ifdef CONFIG_SCHED_MC
+ for_each_cpu_mask(cpu, *cpu_map) {
+ if (sched_group_core_bycpu[cpu]) {
+ kfree(sched_group_core_bycpu[cpu]);
+ sched_group_core_bycpu[cpu] = NULL;
+ }
+ }
+#endif
}

/*

--
Regards,
vatsa


2006-03-27 21:18:50

by Suresh Siddha

Subject: Re: [PATCH 2.6.16-mm1 2/2] sched_domains: Allocate sched_groups dynamically

On Sat, Mar 25, 2006 at 01:58:04PM +0530, Srivatsa Vaddagiri wrote:
> + if (!sched_group_phys && !alloc_phys_failed) {
> + sched_group_phys
> + = kmalloc(sizeof(struct sched_group) * NR_CPUS,
> + GFP_KERNEL);
> + if (!sched_group_phys) {
> + printk (KERN_WARNING
> + "Can not alloc phys sched group\n");
> + alloc_phys_failed = 1;
> + }
> + sched_group_phys_bycpu[i] = sched_group_phys;
> + }

We can move this allocation outside the for loop and avoid the
complexity of the alloc_phys_failed and alloc_core_failed flags.
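
Something along these lines, perhaps (just a sketch of the idea, not a
tested patch; it assumes an error-unwind path like the one the NUMA
allocations already use):

	sched_group_phys = kmalloc(sizeof(struct sched_group) * NR_CPUS,
				   GFP_KERNEL);
	if (!sched_group_phys) {
		printk(KERN_WARNING "Can not alloc phys sched group\n");
		goto error;	/* single failure point, no flags needed */
	}

#ifdef CONFIG_SCHED_MC
	sched_group_core = kmalloc(sizeof(struct sched_group) * NR_CPUS,
				   GFP_KERNEL);
	if (!sched_group_core) {
		printk(KERN_WARNING "Can not alloc core sched group\n");
		goto error;
	}
#endif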

thanks,
suresh