2006-10-19 09:24:07

by Paul Jackson

Subject: [RFC] cpuset: remove sched domain hooks from cpusets

From: Paul Jackson <[email protected]>

Remove the cpuset hooks that defined sched domains depending on the
setting of the 'cpu_exclusive' flag.

The cpu_exclusive flag can only be set on a child if it is set on
the parent.

This made that flag painfully unsuitable for use as a flag defining
a partitioning of a system.

It was entirely unobvious to a cpuset user what partitioning of sched
domains they would be causing when they set that one cpu_exclusive bit
on one cpuset, because it depended on what CPUs were in the remainder
of that cpuset's siblings and child cpusets, after subtracting out
other cpu_exclusive cpusets.

Furthermore, there was no way on production systems to query the
result.

Using the cpu_exclusive flag for this was simply wrong from the get go.

Fortunately, it was sufficiently borked that so far as I know, no
one has made much use of this feature, past the simplest case of
isolating some CPUs from scheduler balancing. A future patch will
propose a simple mechanism for this simple case.

Furthermore, since there was no way on a running system to see what
one was doing with sched domains, this change will be invisible to
any code using it. Unless they have deep insight into the scheduler's load
balancing choices, users will be unable to detect that this change
has been made in the kernel's behaviour.

Signed-off-by: Paul Jackson <[email protected]>

---

Documentation/cpusets.txt | 17 ---------
include/linux/sched.h | 3 -
kernel/cpuset.c | 84 +---------------------------------------------
kernel/sched.c | 27 --------------
4 files changed, 2 insertions(+), 129 deletions(-)

--- 2.6.19-rc1-mm1.orig/kernel/cpuset.c 2006-10-19 01:47:50.000000000 -0700
+++ 2.6.19-rc1-mm1/kernel/cpuset.c 2006-10-19 01:48:10.000000000 -0700
@@ -754,68 +754,13 @@ static int validate_change(const struct
}

/*
- * For a given cpuset cur, partition the system as follows
- * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
- * exclusive child cpusets
- * b. All cpus in the current cpuset's cpus_allowed that are not part of any
- * exclusive child cpusets
- * Build these two partitions by calling partition_sched_domains
- *
- * Call with manage_mutex held. May nest a call to the
- * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
- * Must not be called holding callback_mutex, because we must
- * not call lock_cpu_hotplug() while holding callback_mutex.
- */
-
-static void update_cpu_domains(struct cpuset *cur)
-{
- struct cpuset *c, *par = cur->parent;
- cpumask_t pspan, cspan;
-
- if (par == NULL || cpus_empty(cur->cpus_allowed))
- return;
-
- /*
- * Get all cpus from parent's cpus_allowed not part of exclusive
- * children
- */
- pspan = par->cpus_allowed;
- list_for_each_entry(c, &par->children, sibling) {
- if (is_cpu_exclusive(c))
- cpus_andnot(pspan, pspan, c->cpus_allowed);
- }
- if (!is_cpu_exclusive(cur)) {
- cpus_or(pspan, pspan, cur->cpus_allowed);
- if (cpus_equal(pspan, cur->cpus_allowed))
- return;
- cspan = CPU_MASK_NONE;
- } else {
- if (cpus_empty(pspan))
- return;
- cspan = cur->cpus_allowed;
- /*
- * Get all cpus from current cpuset's cpus_allowed not part
- * of exclusive children
- */
- list_for_each_entry(c, &cur->children, sibling) {
- if (is_cpu_exclusive(c))
- cpus_andnot(cspan, cspan, c->cpus_allowed);
- }
- }
-
- lock_cpu_hotplug();
- partition_sched_domains(&pspan, &cspan);
- unlock_cpu_hotplug();
-}
-
-/*
* Call with manage_mutex held. May take callback_mutex during call.
*/

static int update_cpumask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
- int retval, cpus_unchanged;
+ int retval;

/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
if (cs == &top_cpuset)
@@ -831,12 +776,9 @@ static int update_cpumask(struct cpuset
retval = validate_change(cs, &trialcs);
if (retval < 0)
return retval;
- cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
mutex_lock(&callback_mutex);
cs->cpus_allowed = trialcs.cpus_allowed;
mutex_unlock(&callback_mutex);
- if (is_cpu_exclusive(cs) && !cpus_unchanged)
- update_cpu_domains(cs);
return 0;
}

@@ -1046,7 +988,7 @@ static int update_flag(cpuset_flagbits_t
{
int turning_on;
struct cpuset trialcs;
- int err, cpu_exclusive_changed;
+ int err;

turning_on = (simple_strtoul(buf, NULL, 10) != 0);

@@ -1059,14 +1001,10 @@ static int update_flag(cpuset_flagbits_t
err = validate_change(cs, &trialcs);
if (err < 0)
return err;
- cpu_exclusive_changed =
- (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
mutex_lock(&callback_mutex);
cs->flags = trialcs.flags;
mutex_unlock(&callback_mutex);

- if (cpu_exclusive_changed)
- update_cpu_domains(cs);
return 0;
}

@@ -1930,17 +1868,6 @@ static int cpuset_mkdir(struct inode *di
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}

-/*
- * Locking note on the strange update_flag() call below:
- *
- * If the cpuset being removed is marked cpu_exclusive, then simulate
- * turning cpu_exclusive off, which will call update_cpu_domains().
- * The lock_cpu_hotplug() call in update_cpu_domains() must not be
- * made while holding callback_mutex. Elsewhere the kernel nests
- * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
- * nesting would risk an ABBA deadlock.
- */
-
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cpuset *cs = dentry->d_fsdata;
@@ -1960,13 +1887,6 @@ static int cpuset_rmdir(struct inode *un
mutex_unlock(&manage_mutex);
return -EBUSY;
}
- if (is_cpu_exclusive(cs)) {
- int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
- if (retval < 0) {
- mutex_unlock(&manage_mutex);
- return retval;
- }
- }
parent = cs->parent;
mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
--- 2.6.19-rc1-mm1.orig/Documentation/cpusets.txt 2006-10-19 01:47:09.000000000 -0700
+++ 2.6.19-rc1-mm1/Documentation/cpusets.txt 2006-10-19 01:48:10.000000000 -0700
@@ -86,9 +86,6 @@ This can be especially valuable on:
and a database), or
* NUMA systems running large HPC applications with demanding
performance characteristics.
- * Also cpu_exclusive cpusets are useful for servers running orthogonal
- workloads such as RT applications requiring low latency and HPC
- applications that are throughput sensitive

These subsets, or "soft partitions" must be able to be dynamically
adjusted, as the job mix changes, without impacting other concurrently
@@ -131,8 +128,6 @@ Cpusets extends these two mechanisms as
- A cpuset may be marked exclusive, which ensures that no other
cpuset (except direct ancestors and descendents) may contain
any overlapping CPUs or Memory Nodes.
- Also a cpu_exclusive cpuset would be associated with a sched
- domain.
- You can list all the tasks (by pid) attached to any cpuset.

The implementation of cpusets requires a few, simple hooks
@@ -144,9 +139,6 @@ into the rest of the kernel, none in per
allowed in that tasks cpuset.
- in sched.c migrate_all_tasks(), to keep migrating tasks within
the CPUs allowed by their cpuset, if possible.
- - in sched.c, a new API partition_sched_domains for handling
- sched domain changes associated with cpu_exclusive cpusets
- and related changes in both sched.c and arch/ia64/kernel/domain.c
- in the mbind and set_mempolicy system calls, to mask the requested
Memory Nodes by what's allowed in that tasks cpuset.
- in page_alloc.c, to restrict memory to allowed nodes.
@@ -231,15 +223,6 @@ If a cpuset is cpu or mem exclusive, no
a direct ancestor or descendent, may share any of the same CPUs or
Memory Nodes.

-A cpuset that is cpu_exclusive has a scheduler (sched) domain
-associated with it. The sched domain consists of all CPUs in the
-current cpuset that are not part of any exclusive child cpusets.
-This ensures that the scheduler load balancing code only balances
-against the CPUs that are in the sched domain as defined above and
-not all of the CPUs in the system. This removes any overhead due to
-load balancing code trying to pull tasks outside of the cpu_exclusive
-cpuset only to be prevented by the tasks' cpus_allowed mask.
-
A cpuset that is mem_exclusive restricts kernel allocations for
page, buffer and other data commonly shared by the kernel across
multiple users. All cpusets, whether mem_exclusive or not, restrict
--- 2.6.19-rc1-mm1.orig/include/linux/sched.h 2006-10-19 01:47:09.000000000 -0700
+++ 2.6.19-rc1-mm1/include/linux/sched.h 2006-10-19 01:48:10.000000000 -0700
@@ -715,9 +715,6 @@ struct sched_domain {
#endif
};

-extern int partition_sched_domains(cpumask_t *partition1,
- cpumask_t *partition2);
-
/*
* Maximum cache size the migration-costs auto-tuning code will
* search from:
--- 2.6.19-rc1-mm1.orig/kernel/sched.c 2006-10-19 01:47:09.000000000 -0700
+++ 2.6.19-rc1-mm1/kernel/sched.c 2006-10-19 01:48:10.000000000 -0700
@@ -6735,33 +6735,6 @@ static void detach_destroy_domains(const
arch_destroy_sched_domains(cpu_map);
}

-/*
- * Partition sched domains as specified by the cpumasks below.
- * This attaches all cpus from the cpumasks to the NULL domain,
- * waits for a RCU quiescent period, recalculates sched
- * domain information and then attaches them back to the
- * correct sched domains
- * Call with hotplug lock held
- */
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
-{
- cpumask_t change_map;
- int err = 0;
-
- cpus_and(*partition1, *partition1, cpu_online_map);
- cpus_and(*partition2, *partition2, cpu_online_map);
- cpus_or(change_map, *partition1, *partition2);
-
- /* Detach sched domains from all of the affected cpus */
- detach_destroy_domains(&change_map);
- if (!cpus_empty(*partition1))
- err = build_sched_domains(partition1);
- if (!err && !cpus_empty(*partition2))
- err = build_sched_domains(partition2);
-
- return err;
-}
-
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int arch_reinit_sched_domains(void)
{

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401


2006-10-19 10:25:10

by Nick Piggin

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Fix sched-domains partitioning by cpusets. Walk the whole cpusets tree after
something interesting changes, and recreate all partitions.

Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c 2006-10-19 19:26:54.000000000 +1000
+++ linux-2.6/kernel/cpuset.c 2006-10-19 20:21:29.000000000 +1000
@@ -751,6 +751,24 @@ static int validate_change(const struct
return 0;
}

+static void update_cpu_domains_children(struct cpuset *par,
+ cpumask_t *non_partitioned)
+{
+ struct cpuset *c;
+
+ list_for_each_entry(c, &par->children, sibling) {
+ if (cpus_empty(c->cpus_allowed))
+ continue;
+ if (is_cpu_exclusive(c)) {
+ if (!partition_sched_domains(&c->cpus_allowed)) {
+ cpus_andnot(*non_partitioned,
+ *non_partitioned, c->cpus_allowed);
+ }
+ } else
+ update_cpu_domains_children(c, non_partitioned);
+ }
+}
+
/*
* For a given cpuset cur, partition the system as follows
* a. All cpus in the parent cpuset's cpus_allowed that are not part of any
@@ -760,53 +778,38 @@ static int validate_change(const struct
* Build these two partitions by calling partition_sched_domains
*
* Call with manage_mutex held. May nest a call to the
- * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
- * Must not be called holding callback_mutex, because we must
- * not call lock_cpu_hotplug() while holding callback_mutex.
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. Must not be called holding
+ * callback_mutex, because we must not call lock_cpu_hotplug() while holding
+ * callback_mutex.
*/

-static void update_cpu_domains(struct cpuset *cur)
+static void update_cpu_domains(void)
{
- struct cpuset *c, *par = cur->parent;
- cpumask_t pspan, cspan;
+ cpumask_t non_partitioned;

- if (par == NULL || cpus_empty(cur->cpus_allowed))
- return;
-
- /*
- * Get all cpus from parent's cpus_allowed not part of exclusive
- * children
- */
- pspan = par->cpus_allowed;
- list_for_each_entry(c, &par->children, sibling) {
- if (is_cpu_exclusive(c))
- cpus_andnot(pspan, pspan, c->cpus_allowed);
- }
- if (!is_cpu_exclusive(cur)) {
- cpus_or(pspan, pspan, cur->cpus_allowed);
- if (cpus_equal(pspan, cur->cpus_allowed))
- return;
- cspan = CPU_MASK_NONE;
- } else {
- if (cpus_empty(pspan))
- return;
- cspan = cur->cpus_allowed;
- /*
- * Get all cpus from current cpuset's cpus_allowed not part
- * of exclusive children
- */
- list_for_each_entry(c, &cur->children, sibling) {
- if (is_cpu_exclusive(c))
- cpus_andnot(cspan, cspan, c->cpus_allowed);
- }
- }
+ BUG_ON(!mutex_is_locked(&manage_mutex));

lock_cpu_hotplug();
- partition_sched_domains(&pspan, &cspan);
+ non_partitioned = top_cpuset.cpus_allowed;
+ update_cpu_domains_children(&top_cpuset, &non_partitioned);
+ partition_sched_domains(&non_partitioned);
unlock_cpu_hotplug();
}

/*
+ * Same as above except called with lock_cpu_hotplug and without manage_mutex.
+ */
+
+int cpuset_hotplug_update_sched_domains(void)
+{
+ cpumask_t non_partitioned;
+
+ non_partitioned = top_cpuset.cpus_allowed;
+ update_cpu_domains_children(&top_cpuset, &non_partitioned);
+ return partition_sched_domains(&non_partitioned);
+}
+
+/*
* Call with manage_mutex held. May take callback_mutex during call.
*/

@@ -833,8 +836,8 @@ static int update_cpumask(struct cpuset
mutex_lock(&callback_mutex);
cs->cpus_allowed = trialcs.cpus_allowed;
mutex_unlock(&callback_mutex);
- if (is_cpu_exclusive(cs) && !cpus_unchanged)
- update_cpu_domains(cs);
+ if (!cpus_unchanged)
+ update_cpu_domains();
return 0;
}

@@ -1067,7 +1070,7 @@ static int update_flag(cpuset_flagbits_t
mutex_unlock(&callback_mutex);

if (cpu_exclusive_changed)
- update_cpu_domains(cs);
+ update_cpu_domains();
return 0;
}

@@ -1931,19 +1934,9 @@ static int cpuset_mkdir(struct inode *di
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}

-/*
- * Locking note on the strange update_flag() call below:
- *
- * If the cpuset being removed is marked cpu_exclusive, then simulate
- * turning cpu_exclusive off, which will call update_cpu_domains().
- * The lock_cpu_hotplug() call in update_cpu_domains() must not be
- * made while holding callback_mutex. Elsewhere the kernel nests
- * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
- * nesting would risk an ABBA deadlock.
- */
-
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
+ int is_exclusive;
struct cpuset *cs = dentry->d_fsdata;
struct dentry *d;
struct cpuset *parent;
@@ -1961,13 +1954,8 @@ static int cpuset_rmdir(struct inode *un
mutex_unlock(&manage_mutex);
return -EBUSY;
}
- if (is_cpu_exclusive(cs)) {
- int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
- if (retval < 0) {
- mutex_unlock(&manage_mutex);
- return retval;
- }
- }
+ is_exclusive = is_cpu_exclusive(cs);
+
parent = cs->parent;
mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
@@ -1982,8 +1970,13 @@ static int cpuset_rmdir(struct inode *un
mutex_unlock(&callback_mutex);
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
+
+ if (is_exclusive)
+ update_cpu_domains();
+
mutex_unlock(&manage_mutex);
cpuset_release_agent(pathbuf);
+
return 0;
}

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2006-10-19 19:24:48.000000000 +1000
+++ linux-2.6/kernel/sched.c 2006-10-19 20:21:50.000000000 +1000
@@ -6586,6 +6586,9 @@ error:
*/
static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
+#ifdef CONFIG_CPUSETS
+ return cpuset_hotplug_update_sched_domains();
+#else
cpumask_t cpu_default_map;
int err;

@@ -6599,6 +6602,7 @@ static int arch_init_sched_domains(const
err = build_sched_domains(&cpu_default_map);

return err;
+#endif
}

static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6622,29 +6626,26 @@ static void detach_destroy_domains(const

/*
* Partition sched domains as specified by the cpumasks below.
- * This attaches all cpus from the cpumasks to the NULL domain,
+ * This attaches all cpus from the partition to the NULL domain,
* waits for a RCU quiescent period, recalculates sched
- * domain information and then attaches them back to the
- * correct sched domains
- * Call with hotplug lock held
+ * domain information and then attaches them back to their own
+ * isolated partition.
+ *
+ * Called with hotplug lock held
+ *
+ * Returns 0 on success.
*/
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition)
{
+ cpumask_t non_isolated_cpus;
cpumask_t change_map;
- int err = 0;

- cpus_and(*partition1, *partition1, cpu_online_map);
- cpus_and(*partition2, *partition2, cpu_online_map);
- cpus_or(change_map, *partition1, *partition2);
+ cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+ cpus_and(change_map, *partition, non_isolated_cpus);

/* Detach sched domains from all of the affected cpus */
detach_destroy_domains(&change_map);
- if (!cpus_empty(*partition1))
- err = build_sched_domains(partition1);
- if (!err && !cpus_empty(*partition2))
- err = build_sched_domains(partition2);
-
- return err;
+ return build_sched_domains(&change_map);
}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h 2006-10-19 20:02:24.000000000 +1000
+++ linux-2.6/include/linux/sched.h 2006-10-19 20:02:30.000000000 +1000
@@ -707,8 +707,7 @@ struct sched_domain {
#endif
};

-extern int partition_sched_domains(cpumask_t *partition1,
- cpumask_t *partition2);
+extern int partition_sched_domains(cpumask_t *partition);

/*
* Maximum cache size the migration-costs auto-tuning code will
Index: linux-2.6/include/linux/cpuset.h
===================================================================
--- linux-2.6.orig/include/linux/cpuset.h 2006-10-19 20:07:24.000000000 +1000
+++ linux-2.6/include/linux/cpuset.h 2006-10-19 20:21:08.000000000 +1000
@@ -14,6 +14,8 @@

#ifdef CONFIG_CPUSETS

+extern int cpuset_hotplug_update_sched_domains(void);
+
extern int number_of_cpusets; /* How many cpusets are defined in system? */

extern int cpuset_init_early(void);


Attachments:
sched-domains-cpusets-fixes.patch (8.27 kB)

2006-10-19 19:04:35

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Nick wrote:
> You shouldn't need to, assuming cpusets doesn't mess it up.

I'm guessing we're agreeing that the routines update_cpu_domains()
and related code in kernel/cpuset.c are messing things up.

I view that code as a failed intrusion of some sched domain code into
cpusets, and apparently you view that code as a failed attempt to
manage sched domains coming from cpusets.

Oh well ... finger pointing is such fun ;).

(Fortunately I've forgotten who wrote these routines ... best
I don't know. Whoever you are, don't take it personally. It
was nice clean code, caught between the rock and the flood.)


> + non_partitioned = top_cpuset.cpus_allowed;
> + update_cpu_domains_children(&top_cpuset, &non_partitioned);
> + partition_sched_domains(&non_partitioned);

So ... instead of throwing the baby out, you want to replace it
with a puppy. If one attempt to overload cpu_exclusive didn't
work, try another.

I have two problems with this.

1) I haven't found any need for this, past the need to mark some
CPUs as isolated from the scheduler balancing code, which we
seem to be agreeing on, more or less, on another patch.

Please explain why we need this or any such mechanism for user
space to affect sched domain partitioning.

2) I've had better luck with the cpuset API by adding new flags
when I needed some additional semantics, rather than overloading
existing flags. So once we figure out what's needed and why,
then odds are I will suggest a new flag, specific to that purpose.

This new flag might well logically depend on the cpu_exclusive
setting, if that's useful. But it would probably be a separate
flag or setting.

I dislike providing explicit mechanisms via implicit side effects.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-19 19:22:09

by Nick Piggin

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Paul Jackson wrote:
> Nick wrote:
>
>>You shouldn't need to, assuming cpusets doesn't mess it up.
>
>
> I'm guessing we're agreeing that the routines update_cpu_domains()
> and related code in kernel/cpuset.c are messing things up.

At the moment they are, yes.

> I view that code as a failed intrustion of some sched domain code into
> cpusets, and apparently you view that code as a failed attempt to
> manage sched domains coming from cpusets.
>
> Oh well ... finger pointing is such fun ;).

:)

I don't know about finger pointing, but the sched-domains partitioning
works. It does what you ask of it, which is to partition the
multiprocessor balancing.

>>+ non_partitioned = top_cpuset.cpus_allowed;
>>+ update_cpu_domains_children(&top_cpuset, &non_partitioned);
>>+ partition_sched_domains(&non_partitioned);
>
>
> So ... instead of throwing the baby out, you want to replace it
> with a puppy. If one attempt to overload cpu_exclusive didn't
> work, try another.

It isn't overloading anything. Your cpusets code has assigned a
particular semantic to cpu_exclusive. It so happens that we can
take advantage of this knowledge in order to do a more efficient
implementation.

It doesn't suddenly become a flag to manage sched-domains; its
semantics are completely unchanged (modulo bugs). The cpuset
interface semantics have no connection to sched-domains.

Put it this way: you don't think your code is currently
overloading the cpuset cpus_allowed setting in order to set the
task's cpus_allowed field, do you? You shouldn't need a flag to
tell it to set that, it is all just the mechanism behind the
policy.

> I have two problems with this.
>
> 1) I haven't found any need for this, past the need to mark some
> CPUs as isolated from the scheduler balancing code, which we
> seem to be agreeing on, more or less, on another patch.
>
> Please explain why we need this or any such mechanism for user
> space to affect sched domain partitioning.

Until very recently, the multiprocessor balancing could easily be very
stupid when faced with cpus_allowed restrictions. This is somewhat
fixed, but it is still suboptimal compared to a sched-domains partition
when you are dealing with disjoint cpusets.

It is mostly SGI who seem to be running into these balancing issues, so
I would have thought this would be helpful for your customers primarily.

I don't know of anyone else using cpusets, but I'd be interested to know.

> 2) I've had better luck with the cpuset API by adding new flags
> when I needed some additional semantics, rather than overloading
> existing flags. So once we figure out what's needed and why,
> then odds are I will suggest a new flag, specific to that purpose.

There is no new semantic beyond what is already specified by
cpu_exclusive.

>
> This new flag might well logically depend on the cpu_exclusive
> setting, if that's useful. But it would probably be a separate
> flag or setting.
>
> I dislike providing explicit mechanisms via implicit side effects.

This is more like providing a specific implementation for a given
semantic.

--
SUSE Labs, Novell Inc.

2006-10-19 19:50:26

by Martin Bligh

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets


> I don't know of anyone else using cpusets, but I'd be interested to know.

We (Google) are planning to use it to do some partitioning, albeit on
much smaller machines. I'd really like to NOT use cpus_allowed from
previous experience - if we can get it to partition using separated
sched domains, that would be much better.

From my dim recollections of previous discussions when cpusets was
added in the first place, we asked for exactly the same thing then.
I think some of the problem came from the fact that "exclusive"
to cpusets doesn't actually mean exclusive at all, and they're
shared in some fashion. Perhaps that issue is cleared up now?
/me crosses all fingers and toes and prays really hard.

M.

2006-10-20 00:15:03

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Martin wrote:
> We (Google) are planning to use it to do some partitioning, albeit on
> much smaller machines. I'd really like to NOT use cpus_allowed from
> previous experience - if we can get it to partition using separated
> sched domains, that would be much better.

Are you saying that you wished that cpusets was not implemented using
cpus_allowed, but -instead- implemented using sched domain partitioning?

Well, as you likely can guess by now, that's unlikely.

Cpusets provides hierarchically nested sets of CPU and Memory Nodes,
especially useful for managing nested allocation of processor and
memory resources on large systems. The essential mechanism at the core
of cpusets is manipulating the cpus_allowed and mems_allowed masks in
each task.

Cpusets have also been dabbling in the business of driving the sched
domain partitioning, but I am getting more inclined as time goes on to
think that was a mistake.


> From my dim recollections of previous discussions when cpusets was
> added in the first place, we asked for exactly the same thing then.

What are you asking for again? ;).

Are you asking for a decent interface to sched domain partitioning?

Perhaps cpusets are not the best way to get that.

I hear tell from my colleague Christoph Lameter that he is considering
trying to make some improvements, that would benefit us all, to the
sched domain partitioning code - smaller, faster, simpler, better and
all that good stuff. Perhaps you guys at Google should join in that
effort, and see to it that your needs are met as well. I would
recommend providing whatever kernel-user API's you need for this, if
any, separately from cpusets.

So far, the requirements that I am aware of on such an effort:
1) Somehow support isolated CPUs (no load balancing to or from them).
For example, at least one real-time project needs these.
2) Whatever you were talking about above that Google is planning, some
sort of partitioning.
3) Somehow, whether by magic or by implicit or explicit partitioning
of the system's CPUs, ensure that its load balancing scales to cover
my employer's (SGI) big CPU count systems.
4) Hopefully smaller, less #ifdef'y and easier to understand than the
current code.
5) Avoid poor-fit interactions with cpusets, which have a different
shape (naturally hierarchical), internal mechanism (allowed bitmasks
rather than scheduler balancing domains), scope (combined processor
plus memory) and natural API style (a full-fledged file system to
name these sets, rather than a few bitmasks and flags).

Good luck.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-20 16:03:29

by Nick Piggin

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Martin Bligh wrote:
>
>> I don't know of anyone else using cpusets, but I'd be interested to know.
>
>
> We (Google) are planning to use it to do some partitioning, albeit on
> much smaller machines. I'd really like to NOT use cpus_allowed from
> previous experience - if we can get it to partition using separated
> sched domains, that would be much better.
>
> From my dim recollections of previous discussions when cpusets was
> added in the first place, we asked for exactly the same thing then.
> I think some of the problem came from the fact that "exclusive"
> to cpusets doesn't actually mean exclusive at all, and they're
> shared in some fashion. Perhaps that issue is cleared up now?
> /me crosses all fingers and toes and prays really hard.

The issue, I believe, is that an exclusive cpuset can have an exclusive parent
and exclusive children, which obviously all overlap one another, and
thus you have to do the partition only at the top-most exclusive cpuset.

Currently, cpusets is creating partitions in cpu_exclusive children as
well, which breaks balancing for the parent.

The patch I posted previously should (modulo bugs) only do partitioning
in the top-most cpuset. I still need clarification from Paul as to why
this is unacceptable, though.

--
SUSE Labs, Novell Inc.

2006-10-20 17:50:09

by Suresh Siddha

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Sat, Oct 21, 2006 at 02:03:22AM +1000, Nick Piggin wrote:
> Martin Bligh wrote:
> > We (Google) are planning to use it to do some partitioning, albeit on
> > much smaller machines. I'd really like to NOT use cpus_allowed from
> > previous experience - if we can get it to partition using separated
> > sched domains, that would be much better.
> >
> > From my dim recollections of previous discussions when cpusets was
> > added in the first place, we asked for exactly the same thing then.
> > I think some of the problem came from the fact that "exclusive"
> > to cpusets doesn't actually mean exclusive at all, and they're
> > shared in some fashion. Perhaps that issue is cleared up now?
> > /me crosses all fingers and toes and prays really hard.
>
> The issue, I believe, is that an exclusive cpuset can have an exclusive parent
> and exclusive children, which obviously all overlap one another, and
> thus you have to do the partition only at the top-most exclusive cpuset.
>
> Currently, cpusets is creating partitions in cpu_exclusive children as
> well, which breaks balancing for the parent.
>
> The patch I posted previously should (modulo bugs) only do partitioning
> in the top-most cpuset. I still need clarification from Paul as to why
> this is unacceptable, though.

I like the direction of Nick's patch, which does domain partitioning at the
top-most exclusive cpuset.

thanks,
suresh

2006-10-20 19:00:37

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

> The patch I posted previously should (modulo bugs) only do partitioning
> in the top-most cpuset. I still need clarification from Paul as to why
> this is unacceptable, though.

That patch partitioned on the children of the top cpuset, not the
top cpuset itself.

There is only one top cpuset - and that covers the entire system.

Consider the following example:

/dev/cpuset cpu_exclusive=1, cpus=0-7, task A
/dev/cpuset/a cpu_exclusive=1, cpus=0-3, task B
/dev/cpuset/b cpu_exclusive=1, cpus=4-7, task C

We have three cpusets - the top cpuset and two children, 'a' and 'b'.

We have three tasks, A, B and C. Task A is running in the top cpuset,
with access to all 8 cpus on the system. Tasks B and C are each in
a child cpuset, with access to just 4 cpus.
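
For concreteness, a layout like the one above is built through the cpuset
filesystem. The following is only a rough sketch in C, assuming the cpuset
filesystem is already mounted at /dev/cpuset, that memory node 0 is an
acceptable mems setting for both children, and that minimal error handling
is enough; the file names are as documented in Documentation/cpusets.txt.

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Write a small string to a cpuset control file. */
static void put(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fprintf(f, "%s", val) < 0 || fclose(f) != 0) {
		perror(path);
		exit(1);
	}
}

int main(void)
{
	/* Top cpuset already holds cpus 0-7; mark it cpu_exclusive. */
	put("/dev/cpuset/cpu_exclusive", "1");

	/* Child 'a': cpus 0-3, cpu_exclusive. */
	mkdir("/dev/cpuset/a", 0755);
	put("/dev/cpuset/a/cpus", "0-3");
	put("/dev/cpuset/a/mems", "0");	/* mems must be set before tasks */
	put("/dev/cpuset/a/cpu_exclusive", "1");

	/* Child 'b': cpus 4-7, cpu_exclusive. */
	mkdir("/dev/cpuset/b", 0755);
	put("/dev/cpuset/b/cpus", "4-7");
	put("/dev/cpuset/b/mems", "0");
	put("/dev/cpuset/b/cpu_exclusive", "1");

	/*
	 * Task A simply stays in the top cpuset; tasks B and C would be
	 * attached by writing their pids to /dev/cpuset/a/tasks and
	 * /dev/cpuset/b/tasks respectively.
	 */
	return 0;
}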

By your patch, the cpu_exclusive cpusets 'a' and 'b' partition the
sched domains into two halves, each covering 4 of the system's 8 cpus.
(That, or I'm still a sched domain idiot - quite possible.)

As a result, task A is screwed. If it happens to be on any of cpus
0-3 when the above is set up and the sched domains become partitioned,
it will never be considered for load balancing on any of cpus 4-7.
Or vice versa, if it is on any of cpus 4-7, it has no chance of
subsequently running on cpus 0-3.

If your patch had been just an implicit optimization, benefiting
sched domains, by optimizing for smaller domains when it could do so
without any noticable harm, then it would at least be neutral, and
we could continue the discussion of that patch to ask if it provided
an optimization that helped enough to be worth doing.

But that's not the case, as the above example shows.

I do not see any way to harmlessly optimize sched domain partitioning
based on a system's cpuset configuration.

I am not aware of any possible cpuset configuration that defines a
partitioning of the system's cpus. In particular, the top cpuset
always covers all online cpus, and any task in that top cpuset can
run anywhere, so far as cpusets is concerned.

So ... what can we do.

What -would- be a useful partitioning of sched domains?

Not being a sched domain wizard, I can only hazard a guess, but I'd
guess it would be a partitioning that significantly reduced the typical
size of a sched domain below the full size of the system (apparently it
is quicker to balance several smaller domains than one big one), while
not cutting off any legitimate load balancing possibilities.

The static cpuset configuration doesn't tell us this (see the top
cpuset in the example above), but if one combined that with knowledge
of which cpusets had actively running jobs that should be load
balanced, then that could work.

I doubt we could detect this (which cpusets did or did not need to be
load balanced) automatically. We probably need to have user code tell
us this. That was the point of my patch that started this discussion
several days ago, adding explicit 'sched_domain' flag files to each
cpuset, so user code could mark the cpusets needing to be balanced.

Since proposing that patch, I've changed my recommendation. Instead
of using cpusets to drive sched domain partitioning, better to just
provide a separate API, specific to the needs of sched domains, by
which user code can partition sched domains. That, or make the
balancing fast enough, even on very large domains, that we don't need
to partition. If we do have to partition, it would basically be for
performance reasons, and since I don't see any automatic way to
correctly partition sched domains, I think it would require some
explicit kernel-user API by which user space code can define the
partitioning.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-20 19:19:31

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Suresh wrote:
> I like the direction of Nick's patch, which does domain partitioning
> at the top-most exclusive cpuset.

See the reply I just posted to Nick on this.

His patch didn't partition at the top cpuset, but at its children.
It could not have done any better than that.

The top cpuset covers all online cpus on the system, which is the
same as the default sched domain partition. Partitioning there
would be a no-op, producing the same one big partition we have now.

Partitioning at any lower level, even just the immediate children
of the root cpuset as Nick's patch does, breaks load balancing for
any tasks in the top cpuset.

And even if for some strange reason that weren't a problem, still
partitioning at the level of the immediate children of the root cpuset
doesn't help much on a decent proportion of big systems. Many of my
big systems run with just two cpusets right under the top cpuset, a
tiny cpuset (say 4 cpus) for classic Unix daemons, cron jobs and init,
and a huge (say 1020 out of 1024 cpus) cpuset for the batch scheduler
to slice and dice, to sub-divide into smaller cpusets for the various
jobs and other needs it has.

These systems would still suffer from any performance problems we had
balancing a huge sched domain. Presumably the pain of balancing a
1020 cpu partition is not much less than it is for a 1024 cpu partition.

So, regrettably, Nick's patch is both broken and useless ;).

Only a finer grain sched domain partitioning, that accurately reflects
the placement of active jobs and tasks needing load balancing, is of
much use here.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-20 20:30:17

by Dinakar Guniguntala

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Hi Paul,

This mail seems to be as good as any to reply to, so here goes

On Fri, Oct 20, 2006 at 12:00:05PM -0700, Paul Jackson wrote:
> > The patch I posted previously should (modulo bugs) only do partitioning
> > in the top-most cpuset. I still need clarification from Paul as to why
> > this is unacceptable, though.
>
> That patch partitioned on the children of the top cpuset, not the
> top cpuset itself.
>
> There is only one top cpuset - and that covers the entire system.
>
> Consider the following example:
>
> /dev/cpuset cpu_exclusive=1, cpus=0-7, task A
> /dev/cpuset/a cpu_exclusive=1, cpus=0-3, task B
> /dev/cpuset/b cpu_exclusive=1, cpus=4-7, task C
>
> We have three cpusets - the top cpuset and two children, 'a' and 'b'.
>
> We have three tasks, A, B and C. Task A is running in the top cpuset,
> with access to all 8 cpus on the system. Tasks B and C are each in
> a child cpuset, with access to just 4 cpus.
>
> By your patch, the cpu_exclusive cpusets 'a' and 'b' partition the
> sched domains in two halves, each covering 4 of the systems 8 cpus.
> (That, or I'm still a sched domain idiot - quite possible.)
>
> As a result, task A is screwed. If it happens to be on any of cpus
> 0-3 when the above is set up and the sched domains become partitioned,
> it will never be considered for load balancing on any of cpus 4-7.
> Or vice versa, if it is on any of cpus 4-7, it has no chance of
> subsequently running on cpus 0-3.

Ok, I see the issue here, although the above has been the case all along.
I think the main issue is that most users don't have to do more than one
level of partitioning (having to partition a system with no more than
16 - 32 cpus, mostly less), and it is fairly easy to keep track of
exclusive cpusets and task placements, so this is not such a big problem
at all. However, I can see that with 1024 cpus it is no longer trivial
to remember all of the partitioning, especially if the partitioning is
more than 2 levels deep, and that it gets unwieldy.

So I propose the following changes to cpusets

1. Have a new flag that takes care of sched domains (say sched_domain).
Although I still think that we can tag sched domains on the back of
exclusive cpusets, I think it best to separate the two and maybe even
add a separate CONFIG option for this. This way we can keep any
complexity arising out of this, such as hotplug/sched domain
interactions, all under the config option.
2. The main change is that we don't allow tasks to be added to a cpuset
if it has child cpusets that also have the sched_domain flag turned on
(maybe return -EINVAL if the user tries to do that; a rough sketch of
such a check follows below).
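
A minimal sketch of how rule 2 might look, written in the style of the
existing checks in kernel/cpuset.c; is_sched_domain() and the sched_domain
flag itself are assumptions of this illustration, not code that exists today:

/*
 * Hypothetical: reject attaching a task to a cpuset that has a child
 * marked with the proposed sched_domain flag.  Would be called from
 * attach_task(), with manage_mutex held, alongside the existing checks.
 */
static int no_sched_domain_children(struct cpuset *cs)
{
	struct cpuset *c;

	list_for_each_entry(c, &cs->children, sibling) {
		if (is_sched_domain(c))		/* assumed helper */
			return -EINVAL;
	}
	return 0;
}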

Clearly one issue remains: tasks that are already running in the top cpuset.
Unless these are manually moved down to the correct cpuset hierarchy, they
will continue to have the problem as before. I still don't have a simple
enough solution for this other than to document it. But I still think that
on smaller systems this should be a fairly easy task for the administrator,
if they really know what they are doing. And the fact that we have a
separate flag to indicate the sched domain partitioning should make it
harder for them to shoot themselves in the foot. Maybe there are other,
better ways to resolve this?

One point I would argue against is to completely decouple cpusets and
sched domains. We do need a way to partition sched domains and doing
it along the lines of cpusets seems to be the most logical. This is
also much simpler in terms of additional lines of code needed to support
this feature. (as compared to adding a whole new API just to do this)

-Dinakar

2006-10-20 21:42:18

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

> One point I would argue against is to completely decouple cpusets and
> sched domains. We do need a way to partition sched domains and doing
> it along the lines of cpusets seems to be the most logical. This is
> also much simpler in terms of additional lines of code needed to support
> this feature. (as compared to adding a whole new API just to do this)

The "simpler" (fewer code lines) point I can certainly agree with.

The "most logical" point I go back and forth on.

The flat partitions, forming a complete, non-overlapping cover, needed
by sched domains can be mapped to selected cpusets in their nested
hierarchy, if we impose the probably reasonable constraint that for
any cpuset across which we require load balancing, we would want that
cpuset's cpus to be entirely contained within a single sched domain
partition.
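
To illustrate that constraint, here is a hedged sketch; representing a
candidate sched domain partition as an array of cpumasks is purely an
assumption for this example, not something that exists in the code above:

/*
 * Hypothetical: does this cpuset's cpus_allowed fit entirely inside one
 * element of a candidate partition?  If not, the cpuset is cut by a
 * partition boundary and load balancing across it would be lost.
 */
static int cpuset_fits_partition(const cpumask_t *cs_cpus,
				 const cpumask_t *parts, int nparts)
{
	int i;

	for (i = 0; i < nparts; i++)
		if (cpus_subset(*cs_cpus, parts[i]))
			return 1;
	return 0;
}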

Earlier, such as last week and before, I had been operating under the
assumption that sched domain partitions were hierarchical too, so that a
partition boundary running right down the middle of my most active cpuset
didn't stop load balancing across that boundary, but just perhaps slowed
it down a bit, as it would only occur at some higher level in the
partition hierarchy, which presumably balanced less frequently.
Apparently this sched domain partition hierarchy was a figment of my
over-active imagination, along with the tooth fairy and Santa Claus.

Anyhow, if we consider that constraint (don't split or cut an active
cpuset across partitions) not only reasonable, but desirable to impose,
then integrating the sched domain partitioning with cpusets, as you
describe, would indeed seem "most logical."

> 2. The main change is that we don't allow tasks to be added to a cpuset
> if it has child cpusets that also have the sched_domain flag turned on
> (maybe return -EINVAL if the user tries to do that)

This I would not like. It's ok to have tasks in cpusets that are
cut by sched domain partitions (which is what I think you were getting
at), just so long as one doesn't mind that they don't load balance
across the partition boundaries.

For example, we -always- have several tasks per-cpu in the top cpuset.
These are the per-cpu kernel threads. They have zero interest in
load balancing, because they are pinned on a cpu, for their life.

Or, for a slightly more interesting example, one might have a sleeping
job (batch scheduler sent SIGPAUSE to all its threads) that is in a
cpuset cut by the current sched domain partitioning. Since that job is
not running, we don't care whether it gets good load balancing services
or not.

I still suspect we will just have to let the admin partition their
system as they will, and if they screw up their load balancing,
the best we can do is to make all this as transparent and simple
and obvious as we can, and wish them well.

One thing I'm sure of. The current (ab)use of the 'cpu_exclusive' flag
to define sched domain partitions is flunking the "transparent, simple
and obvious" test ;).

> I think the main issue is that most users don't have to
> do more than one level of partitioning (having to partition a system
> with no more than 16 - 32 cpus, mostly less)

Could you (or some sched domain wizard) explain to me why we would even
want sched domain partitions on such 'small' systems? I've been operating
under the (mis?)conception that these sched domain partitions were just
a performance band-aid for the humongous systems, where load balancing
across say 1024 CPUs was difficult to do efficiently.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-20 21:47:14

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Dinakar wrote:
> Clearly one issue remains, tasks that are already running at the top cpuset.
> Unless these are manually moved down to the correct cpuset heirarchy they
> will continue to have the problem as before.

I take it you are looking for some reasonable and acceptable
constraints to place on cpusets, sufficient to enable us to
make it impossible (or at least difficult) to botch the
load balancing.

You want to make it difficult to split an active cpuset, so as
to avoid the undesirable limiting of load balancing across such
partition boundaries.

I doubt we can find a way to do that. We'll have to let our
users make a botch of it.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-20 22:36:02

by Dinakar Guniguntala

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Fri, Oct 20, 2006 at 02:41:53PM -0700, Paul Jackson wrote:
>
> > 2. The main change is that we don't allow tasks to be added to a cpuset
> > if it has child cpusets that also have the sched_domain flag turned on
> > (maybe return -EINVAL if the user tries to do that)
>
> This I would not like. It's ok to have tasks in cpusets that are
> cut by sched domain partitions (which is what I think you were getting
> at), just so long as one doesn't mind that they don't load balance
> across the partition boundaries.
>
> For example, we -always- have several tasks per-cpu in the top cpuset.
> These are the per-cpu kernel threads. They have zero interest in
> load balancing, because they are pinned on a cpu, for their life.

I cannot think of any reason why this change would affect per-cpu tasks.

>
> Or, for a slightly more interesting example, one might have a sleeping
> job (batch scheduler sent SIGPAUSE to all its threads) that is in a
> cpuset cut by the current sched domain partitioning. Since that job is
> not running, we don't care whether it gets good load balancing services
> or not.

Ok, here's when I think a system administrator would want to partition
sched domains. If there is an application that is very sensitive to
performance and latencies and would have very low tolerance for
interference from any other code running on the cpus, then the
admin would partition the sched domain and separate this application
from the rest of the system. (per-cpu threads obviously will
continue to run in the same domain as the app)

So in this example, clearly there is no sense in letting a batch job
run in the same sched domain as our application. Now let's say our
performance and latency sensitive application only runs during the
day, then the admin can turn off the sched domain flag and tear down
the sched domain for the night. This will then enable the batch job
running in the parent cpuset to get a chance to run on all the cpus.

Returning -EINVAL when trying to attach a job to the top cpuset, when it
has child cpusets with the sched_domain flag turned on, would mean that
the administrator knows that s/he does not have all of the cpus in that
cpuset for their use. However, attaching jobs (such as the batch job in
your example) to the top cpuset before doing any sched domain partitioning
would still let them make the best use of resources (sort of a backdoor).
If you feel that this puts too much of a restriction on the admin for
creating tasks such as the batch job, then we would have to do without it
(and just document the sched_domain flag and its effects).

>
> I still suspect we will just have to let the admin partition their
> system as they will, and if they screw up their load balancing,
> the best we can do is to make all this as transparent and simple
> and obvious as we can, and wish them well.
>
> One thing I'm sure of. The current (ab)use of the 'cpu_exclusive' flag
> to define sched domain partitions is flunking the "transparent, simple
> and obvious" test ;).

I think this is a case of one set of folks talking about <32 cpu systems
and another set talking about >512 cpu systems.

>
> > I think the main issue is that most users don't have to
> > do more than one level of partitioning (having to partition a system
> > with no more than 16 - 32 cpus, mostly less)
>
> Could you (or some sched domain wizard) explain to me why we would even
> want sched domain partitions on such 'small' systems? I've been operating
> under the (mis?)conception that these sched domain partitions were just
> a performance band-aid for the humongous systems, where load balancing
> across say 1024 CPUs was difficult to do efficiently.

Well, it makes a difference for applications that have an RT/performance
sensitive component that needs a sched domain of its own.

-Dinakar

2006-10-20 23:34:23

by Suresh Siddha

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

How about something like a use_cpus_exclusive flag in cpusets?

And whenever a child cpuset sets this use_cpus_exclusive flag, remove
that child cpuset's cpus from the parent cpuset and also from the
tasks which were running in the parent cpuset. We can probably allow this
to happen as long as the parent cpuset has at least one cpu left.

And if this use_cpus_exclusive flag is cleared in a cpuset, its pool of
cpus will be returned to the parent. We can perhaps have cpus_owned in
addition to cpus_allowed, to reflect what is being exclusively used and
owned (which combines all the exclusive cpus used by the parent and
children).

So effectively, a sched domain partition will get defined for each
cpuset having 'use_cpus_exclusive'.

And this is mostly in line with what anyone can expect from exclusive
cpu usage in a cpuset, right?
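
A rough sketch of the bookkeeping being proposed, using the cpumask
helpers seen elsewhere in this thread; 'use_cpus_exclusive' and a
'cpus_owned' field are assumptions of this illustration and do not exist
in the real cpuset code:

/*
 * Hypothetical: a child turns on use_cpus_exclusive, so the parent gives
 * up those cpus but remembers them in cpus_owned for when the flag is
 * later cleared.
 */
static void take_cpus_exclusive(struct cpuset *parent, struct cpuset *child)
{
	cpus_andnot(parent->cpus_allowed,
		    parent->cpus_allowed, child->cpus_allowed);
	cpus_or(parent->cpus_owned,		/* assumed new field */
		parent->cpus_owned, child->cpus_allowed);
	/*
	 * Tasks already running in the parent would also need their
	 * per-task cpus_allowed masks narrowed accordingly (not shown).
	 */
}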

Job manager/administrator/owner of the cpusets can set/reset the flags
depending on what cpusets/jobs are active.

Paul will this address your needs?

thanks,
suresh

2006-10-21 05:38:00

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

> And whenever a child cpuset sets this use_cpus_exclusive flag, remove
> that child cpuset's cpus from the parent cpuset and also from the
> tasks which were running in the parent cpuset. We can probably allow this
> to happen as long as the parent cpuset has at least one cpu left.

Why are you seeking out obfuscated means and gratuitous entanglements
with cpuset semantics, in order to accomplish a straightforward end -
defining the sched domain partitioning?

If we are going to add or modify the meaning of per-cpuset flags in
order to determine sched domain partitioning, then we should do so in
the most straight forward way possible, which by all accounts seems to
be adding a 'sched_domain' flag to each cpuset, indicating whether it
delineates a sched domain partition. The kernel would enforce a rule
that the CPUs in the cpusets so marked could not overlap. The kernel
in return would promise not to split the CPUs in any cpuset so marked
into more than one sched domain partition, with the consequence that
the kernel would be able to load balance across all the CPUs contained
within any such partition.
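
A hedged sketch of that non-overlap rule, in the style of validate_change()
in kernel/cpuset.c; is_sched_domain() and the flag itself are assumptions
of this illustration:

/*
 * Hypothetical: walk the cpuset tree from 'root' (e.g. &top_cpuset) and
 * report whether 'trial', marked with the proposed sched_domain flag,
 * overlaps any other cpuset so marked.  validate_change() could then
 * return -EINVAL on overlap.
 */
static int sched_domain_cpus_overlap(struct cpuset *root, struct cpuset *trial)
{
	struct cpuset *c;

	list_for_each_entry(c, &root->children, sibling) {
		if (c != trial && is_sched_domain(c) &&
		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
			return 1;
		if (sched_domain_cpus_overlap(c, trial))
			return 1;
	}
	return 0;
}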

Why do something less straightforward than that?

Meanwhile ...

If the existing cpuset semantics implied a real and useful partitioning
of the systems CPUs, as Nick had been figuring it did, then yes it
might make good sense to implicitly and automatically leverage this
cpuset partitioning when partitioning the sched domains.

But cpuset semantics, quite deliberately on my part, don't imply any
such system wide partitioning of CPUs.

So one of:
1) we don't need sched domain partitioning after all, or
2) this sched domain partitioning takes on a hierarchical nested shape
that fits better with cpusets, or
3) we provide a "transparent, simple and obvious" API to user space,
so it can define the sched domain partitioning.

And in any case, we should first take a look at the rest of this sched
domain code, as I have been led to believe it provides some nice
opportunities for refinement, before we go fussing over the details of
any kernel-user API's it might need.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-21 18:23:20

by Paul Menage

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On 10/19/06, Martin Bligh <[email protected]> wrote:
>
> > I don't know of anyone else using cpusets, but I'd be interested to know.
>
> We (Google) are planning to use it to do some partitioning, albeit on
> much smaller machines. I'd really like to NOT use cpus_allowed from
> previous experience - if we can get it to partition using separated
> sched domains, that would be much better.

Actually, what we'd really like is to be able to set cpus_allowed in
arbitrary ways (we're already doing this via sched_setaffinity() -
doing it via cpusets would just be an optimization when changing cpu
masks) and have the scheduler automatically do balancing efficiently.
In some cases sched domains might be appropriate, but in most of the
cases we have today, we have a job that's running with a CPU reserved
for itself but also has access to a "public" CPU, and some CPUs are
not public, but shared amongst a set of jobs. I'm not very familiar
with the sched domains code but I guess it doesn't handle overlapping
cpu masks very well?

Paul

2006-10-21 20:55:42

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

> I'm not very familiar
> with the sched domains code but I guess it doesn't handle overlapping
> cpu masks very well?

As best as I can tell, the two motivations for explicitly setting
sched domain partitions are:
1) isolating cpus for real time uses very sensitive to any interference,
2) handling load balancing on huge CPU counts, where the worse than linear
algorithms start to hurt.

The load balancing algorithms apparently should be close to linear, but
in the presence of many disallowed cpus (0 bits in a task's cpus_allowed),
I guess they have to work harder.

I still have little confidence that I understand this. Maybe if I say
enough stupid things about the scheduler domains and load balancing,
someone will get annoyed and try to educate me ;). Best of luck to
them.

It doesn't sound to me like your situation is a real time, very low
latency or jitter sensitive application.

How many CPUs are you juggling? My utterly naive expectation would be
that dozens of CPUs should not need explicit sched domain partitioning,
but that hundreds of them would benefit from reduced time spent in
kernel/sched.c code if the sched domains were able to be partitioned
down to a significantly smaller size.

The only problem I can see that overlapping cpus_allowed masks presents
to this is that it inhibits partitioning down to smaller sched domains.
Apparently these partitions are system-wide hard partitions, such that
no load balancing occurs across partitions, so we should avoid creating
a partition that cuts across some tasks cpus_allowed mask.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-21 20:59:48

by Paul Menage

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On 10/21/06, Paul Jackson <[email protected]> wrote:
>
> How many CPUs are you juggling?

Not many by your standards - less than eight in general.

Paul

2006-10-21 23:05:50

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Suresh wrote:
> And whenever a child cpuset sets this use_cpus_exclusive flag, remove
> that child cpuset's cpus from the parent cpuset and also from the ..

That reminds me a little of Dinakar's first patch to partition sched
domains based on the cpuset configuration:

Subject: [Lse-tech] [RFC PATCH] Dynamic sched domains aka Isolated cpusets
From: Dinakar Guniguntala <[email protected]>
Date: Tue, 19 Apr 2005 01:56:44 +0530
http://lkml.org/lkml/2005/4/18/187

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-22 10:52:09

by Paul Jackson

Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Martin wrote:
> We (Google) are planning to use it to do some partitioning, albeit on
> much smaller machines. I'd really like to NOT use cpus_allowed from
> previous experience - if we can get it to partition using separated
> sched domains, that would be much better.

Why not use cpus_allowed for this, via cpusets and/or sched_setaffinity?

In the followup to this between Paul M. and myself, I wrote:
> As best as I can tell, the two motivations for explicitly setting
> sched domain partitions are:
> 1) isolating cpus for real time uses very sensitive to any interference,
> 2) handling load balancing on huge CPU counts, where the worse than linear
> algorithms start to hurt.
> ...
> How many CPUs are you juggling?

And Paul M. replied:
> Not many by your standards - less than eight in general.

So ... it would seem you have neither huge CPU counts nor real time
sensitivities.

So why not use cpus_allowed?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-22 12:03:11

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Dinikar wrote:
> On Fri, Oct 20, 2006 at 02:41:53PM -0700, Paul Jackson wrote:
> >
> > > 2. The main change is that we dont allow tasks to be added to a cpuset
> > > if it has child cpusets that also have the sched_domain flag turned on
> > > (Maybe return a EINVAL if the user tries to do that)
> >
> > This I would not like. It's ok to have tasks in cpusets that are
> > cut by sched domain partitions (which is what I think you were getting
> > at), just so long as one doesn't mind that they don't load balance
> > across the partition boundaries.
> >
> > For example, we -always- have several tasks per-cpu in the top cpuset.
> > These are the per-cpu kernel threads. They have zero interest in
> > load balancing, because they are pinned on a cpu, for their life.
>
> I cannot think of any reason why this change would affect per-cpu tasks.

You are correct that cpu pinned tasks don't mind not being load balanced.

I was reading too much into what you had suggested.

You suggested not adding tasks to cpusets that have child cpusets
defining sched domains.

But the fate of tasks that were already there was an open issue.

That kind of ordering dependency struck me as so odd that I
leapt to the false conclusion that you couldn't possibly have
meant it, and really meant to disallow any tasks in such a cpuset,
whether they were there before we set up the sched_domain or not.

So ... never mind that comment.


Popping the stack, yes, as a practical matter, when setting up
some nodes to be used by real time software, we offload everything
else we can from those nodes, to minimize the interference.

We don't need more error conditions from the kernel to handle that,
not even on big systems. We just need some way to isolate those
nodes (cpu and memory) from any scheduler load balancing efforts.

This is easy. It can be done now at boot time (isolcpus=), and at
runtime with the existing cpu_exclusive overloading. It essentially
just involves setting a single system-wide cpumask of isolated cpus.
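
(Rough sketch of that single-mask idea, modeled loosely on the existing
isolcpus= handling; build_balanced_domains is a made-up name and this
is not the actual kernel code:)

#include <linux/cpumask.h>

/* One system-wide mask of cpus excluded from load balancing. */
static cpumask_t cpu_isolated_map = CPU_MASK_NONE;

/*
 * Build sched domains only over the cpus that are not isolated, so
 * that isolated cpus end up in no balancing domain at all.
 */
static void build_balanced_domains(const cpumask_t *cpu_map)
{
        cpumask_t balanced;

        cpus_andnot(balanced, *cpu_map, cpu_isolated_map);
        /* ... construct sched domains spanning 'balanced' only ... */
}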


In addition, large systems (apparently for scheduler load balancing
performance reasons) need a way to partition sched domains down to
more reasonable sizes.

Partitioning sched domains interacts with cpusets in ways that are
still confounding us, and depends critically on information that really
only system services such as the batch scheduler can provide (such as
which jobs are active). And this, even at a mathematical minimum,
involves a partition of the system's cpus, which is a set of subsets,
or multiple cpumasks instead of just one. It's harder.
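
(In code terms such a partitioning is just an array of pairwise
disjoint cpumasks; a hypothetical validity check, illustration only and
not kernel code, might look like this:)

#include <linux/cpumask.h>

/*
 * Hypothetical check: the proposed partition masks must not share
 * any cpu.  'parts' is an array of 'nparts' cpumasks.
 */
static int partitions_are_disjoint(const cpumask_t *parts, int nparts)
{
        cpumask_t seen = CPU_MASK_NONE;
        cpumask_t overlap;
        int i;

        for (i = 0; i < nparts; i++) {
                cpus_and(overlap, seen, parts[i]);
                if (!cpus_empty(overlap))
                        return 0;       /* some cpu is in two partitions */
                cpus_or(seen, seen, parts[i]);
        }
        return 1;
}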

My sense is that your proposal, to get rid of (or not allow more of)
the tasks in the overlapping parent cpusets that were complicating
this effort, just makes life harder for such system services, forcing
them to work around our efforts to make life 'safe' for them.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-23 03:10:21

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Sorry Dinakar - I've been misspelling your name (Dinikar).

My bad.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-23 04:51:38

by Suresh Siddha

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Fri, Oct 20, 2006 at 10:37:38PM -0700, Paul Jackson wrote:
> > And whenever a child cpuset sets this use_cpus_exclusive flag, remove
> > those set of child cpuset cpus from parent cpuset and also from the
> > tasks which were running in the parent cpuset. We can probably allow this
> > to happen as long as parent cpuset has atleast one cpu.
>
> Why are you seeking out obfuscated means and gratuitous entanglements
> with cpuset semantics, in order to accomplish a straight forward end -
> defining the sched domain partitioning?
>
> If we are going to add or modify the meaning of per-cpuset flags in
> order to determine sched domain partitioning, then we should do so in
> the most straight forward way possible, which by all accounts seems to
> be adding a 'sched_domain' flag to each cpuset, indicating whether it
> delineates a sched domain partition. The kernel would enforce a rule
> that the CPUs in the cpusets so marked could not overlap. The kernel
> in return would promise not to split the CPUs in any cpuset so marked
> into more than one sched domain partition, with the consequence that
> the kernel would be able to load balance across all the CPUs contained
> within any such partition.
>
> Why do something less straightforward than that?

Ok, I went into implementation details (and ended up less
straightforward), but my main intention was to say that we need to
retain some sort of hierarchical shape too while creating these domain
partitions.

For example, a big system can be divided into different groups of
cpus, one for each department in an organisation. Internally, based on
its needs, each department can divide its pool of cpus into sub-groups
and allocate them to much smaller groups. As sub-groups are created
and deleted, cpus move from the department to the sub-groups and vice
versa.

Users probably want both flat and hierarchical partitions. And in this
partitioning mechanism, we should never allow a cpu to be present in
more than one partition.
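
(For what it's worth, that hierarchical shape is what the cpuset
filesystem already expresses.  A toy user-space illustration, assuming
a cpuset filesystem mounted at /dev/cpuset; the directory names are
made up, and whether any sched-domain style flag belongs in there is
exactly the open question of this thread:)

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int write_file(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        /* Department gets cpus 0-7, one sub-group gets cpus 0-3 of
         * those.  Error handling omitted for brevity. */
        mkdir("/dev/cpuset/dept_a", 0755);
        write_file("/dev/cpuset/dept_a/cpus", "0-7");
        write_file("/dev/cpuset/dept_a/mems", "0");

        mkdir("/dev/cpuset/dept_a/subgroup_1", 0755);
        write_file("/dev/cpuset/dept_a/subgroup_1/cpus", "0-3");
        write_file("/dev/cpuset/dept_a/subgroup_1/mems", "0");

        return 0;
}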

thanks,
suresh

2006-10-23 05:47:21

by Suresh Siddha

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Sun, Oct 22, 2006 at 03:51:35AM -0700, Paul Jackson wrote:
> Martin wrote:
> > We (Google) are planning to use it to do some partitioning, albeit on
> > much smaller machines. I'd really like to NOT use cpus_allowed from
> > previous experience - if we can get it to to partition using separated
> > sched domains, that would be much better.
>
> Why not use cpus_allowed for this, via cpusets and/or sched_setaffinity?

group of pinned tasks can completely skew the system load balancing..

2006-10-23 05:55:14

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Suresh wrote:
> group of pinned tasks can completely skew the system load balancing..

Ah - yes. That was a problem. If the load balancer couldn't offload
tasks from one or two of the most loaded CPUs (perhaps because they
were pinned.) it tended to give up.

I believe that Christoph is actively working that problem. Adding him
to the cc list, so he can explain the state of this work more
accurately.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-23 05:59:51

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Suresh wrote:
> Ok, I went into implementation details (and ended up less
> straightforward), but my main intention was to say that we need to
> retain some sort of hierarchical shape too while creating these
> domain partitions.

Good points.

Getting cpusets to work in a hierarchical organization managing a large
system is a key goal of mine.

That means shaping the APIs so that they fit the structure of various
users, so that the right person or program can make the right decision
at the right time, easily, and have it all work.

Take a look at my "no need to load balance" flag idea, in my post
a few minutes ago responding to Nick. That feels to me like it
might be an API that fits the users space, understanding and needs
well, while still giving us what we need to be able to reduce the
size of sched domain partitions on huge systems, where possible.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-23 06:02:38

by Nick Piggin

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Paul Jackson wrote:
> Suresh wrote:
>
>>group of pinned tasks can completely skew the system load balancing..
>
>
> Ah - yes. That was a problem. If the load balancer couldn't offload
> tasks from one or two of the most loaded CPUs (perhaps because they
> were pinned.) it tended to give up.
>
> I believe that Christoph is actively working that problem. Adding him
> to the cc list, so he can explain the state of this work more
> accurately.

It is somewhat improved. The load balancing will now retry other CPUs,
but this is pretty costly in terms of latency and rq lock hold time.
And the algorithm itself still breaks down if you have lots of pinned
tasks, even if the load balancer is willing to try lesser loaded cpus.

--
SUSE Labs, Novell Inc.

2006-10-23 06:04:02

by Suresh Siddha

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Sun, Oct 22, 2006 at 10:54:56PM -0700, Paul Jackson wrote:
> Suresh wrote:
> > group of pinned tasks can completely skew the system load balancing..
>
> Ah - yes. That was a problem. If the load balancer couldn't offload
> tasks from one or two of the most loaded CPUs (perhaps because they
> were pinned.) it tended to give up.
>
> I believe that Christoph is actively working that problem. Adding him
> to the cc list, so he can explain the state of this work more
> accurately.

Pinned tasks can cause a number of challenges for the scheduler.

Christoph has recently addressed one such issue, and even that only
partially.

It is very difficult to nicely and uniformly distribute load that is
pinned to a group of cpus.

2006-10-23 06:16:40

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Nick wrote:
> It is somewhat improved. The load balancing will now retry other CPUs,
> but this is pretty costly

Ah - ok. Sounds like a sticky problem.

I am beginning to appreciate Martin's preference for not using
cpus_allowed to pin tasks when load balancing is also needed.

For the big HPC apps that I worry about the most, with hundreds of
parallel, closely coupled threads, one per cpu, we pin all over the
place. But we make very little use of load balancing in that
situation, with one compute bound thread per cpu, humming along for
hours. The scheduler pretty quickly figures out that it has no
useful load balancing to do.

On the other hand, as someone already noted, one can't simulate pinning
to overlapping cpus_allowed masks using overlapping sched domains, as
tasks can just wander off onto someone else's cpu that way.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-10-23 16:01:50

by Christoph Lameter

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Sun, 22 Oct 2006, Paul Jackson wrote:

> I believe that Christoph is actively working that problem. Adding him
> to the cc list, so he can explain the state of this work more
> accurately.

That issue was fixed by retrying load balancing without the cpu that has
all processes pinned.

2006-10-23 16:03:45

by Christoph Lameter

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

On Mon, 23 Oct 2006, Nick Piggin wrote:

> It is somewhat improved. The load balancing will now retry other CPUs,
> but this is pretty costly in terms of latency and rq lock hold time.
> And the algorithm itself still breaks down if you have lots of pinned
> tasks, even if the load balancer is willing to try lesser loaded cpus.

We would need a way of traversing the processors by load in order
to avoid that. John Hawkes solved this issue earlier, in 2.4.x, by
managing a list of processors ordered by their load.
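
(The idea, very loosely sketched as a toy user-space illustration, not
Hawkes' actual 2.4 code:)

#include <stdlib.h>

#define NR_CPUS_DEMO 8

static unsigned long cpu_load[NR_CPUS_DEMO];

/* qsort comparator: higher load sorts earlier (busiest first). */
static int busier_first(const void *a, const void *b)
{
        unsigned long la = cpu_load[*(const int *)a];
        unsigned long lb = cpu_load[*(const int *)b];

        return (la < lb) - (la > lb);
}

/* Fill 'order' with cpu ids, busiest first, so a balancer could walk
 * down the list instead of giving up when the busiest cpu only has
 * pinned tasks. */
static void order_cpus_by_load(int order[NR_CPUS_DEMO])
{
        int i;

        for (i = 0; i < NR_CPUS_DEMO; i++)
                order[i] = i;
        qsort(order, NR_CPUS_DEMO, sizeof(int), busier_first);
}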


2006-11-09 10:59:58

by Paul Jackson

[permalink] [raw]
Subject: Re: [RFC] cpuset: remove sched domain hooks from cpusets

Andrew,

This patch is currently residing in your *-mm stack, as:

cpuset-remove-sched-domain-hooks-from-cpusets.patch

If it's easy for you to keep track of, I'd like to ask that you not
push this to Linus until Dinakar and I (with the consent of various
mm experts) settle on the replacement mechanism for dealing with
sched domain partitioning (or whatever that turns into).

At the rate Dinakar and I are progressing, this likely means that this
"... remove ... hooks ..." patch will be sitting in *-mm through the
2.6.20 work, and go on to Linus for 2.6.21.

There are some folks actually depending on this mechanism, such as
some real time folks using this to inhibit load balancing on their
isolated CPUs. It would be polite not to yank out one mechanism
before its replacement is available.

If this sounds like a pain, or you'd just rather not babysit this in
*-mm that long, then I'd rather have you drop the patch than send it
to Linus without an accompanying replacement.

All else equal, I kind of like leaving this patch in *-mm for now, as
it puts a stake in the ground, indicating that the current connection
between the per-cpuset "cpu_exclusive" flag and sched domain partitions
is sitting on death row.

There seems to be a general consensus that death row is a good place
for that.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401