Subject: [RFC PATCH 0/3] Refcount Based Cpu Hotplug V3

Hi,
This is Try #3 for the Refcount + Waitqueue based implementation
for cpu-hotplug locking. The earlier versions can be found at

http://lkml.org/lkml/2007/10/24/36
and
http://lkml.org/lkml/2007/10/16/118.

This version drops patch 4 from the earlier series,
which was basically removing the CPU_DEAD and CPU_UP_CANCELED
event handling in the workqueue subsystem. This was based on
Oleg Nesterov's comments, since the problem which that patch
was attempting to solve is more complicated and needs to be looked
into separately.

The patch series is against v2.6.24-rc2 and has survived
overnight kern_bench + cpu-offline-online tests on
- 8 way x86,
- 8 way x86_64,
- 128 way ppc64 boxes.

The patch series has gone through many iterations
and looks to have matured in the process.

Andrew, could you please consider it for the next -mm
release if there are no objections?

Thanks and Regards
gautham.

--
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a bargain,
because Freedom is priceless!"


Subject: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation

From: Gautham R Shenoy <[email protected]>
Date: Thu, 15 Nov 2007 18:14:20 +0530
Subject: [PATCH 1/3] cpu-hotplug: Refcount based Cpu Hotplug implementation

This patch implements a Refcount + Waitqueue based model for
cpu-hotplug.

A thread which wants to prevent cpu-hotplug bumps up a global
refcount, while a thread which wants to perform a cpu-hotplug
operation blocks till the global refcount goes to zero.

Any readers that arrive during an ongoing cpu-hotplug operation are
blocked until the cpu-hotplug operation is over.
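
To illustrate how the two sides are meant to pair up (a sketch only,
using the helpers introduced by the diff below):

	/* Reader side: code that must not race with a cpu going away */
	lock_cpu_hotplug();		/* cpu_hotplug.refcount++ */
	/* ... walk cpu_online_map, touch per-cpu data ... */
	unlock_cpu_hotplug();		/* refcount--; wakes a waiting writer */

	/* Writer side: the cpu_up()/cpu_down() paths */
	cpu_maps_update_begin();	/* serialize updates to the cpu maps */
	cpu_hotplug_begin();		/* block until refcount drops to zero */
	/* ... modify cpu_online_map / cpu_present_map ... */
	cpu_hotplug_done();
	cpu_maps_update_done();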

Signed-off-by: Gautham R Shenoy <[email protected]>
Signed-off-by: Paul Jackson <[email protected]> [For !CONFIG_HOTPLUG_CPU ]
---
include/linux/cpu.h | 3 +
init/main.c | 1 +
kernel/cpu.c | 152 +++++++++++++++++++++++++++++++++++++--------------
3 files changed, 115 insertions(+), 41 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b79c575..d1676dd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -83,6 +83,9 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)

#endif /* CONFIG_SMP */
extern struct sysdev_class cpu_sysdev_class;
+extern void cpu_hotplug_init(void);
+extern void cpu_maps_update_begin(void);
+extern void cpu_maps_update_done(void);

#ifdef CONFIG_HOTPLUG_CPU
/* Stop CPUs going up and down. */
diff --git a/init/main.c b/init/main.c
index f605a96..466a3ac 100644
--- a/init/main.c
+++ b/init/main.c
@@ -606,6 +606,7 @@ asmlinkage void __init start_kernel(void)
vfs_caches_init_early();
cpuset_init_early();
mem_init();
+ cpu_hotplug_init();
kmem_cache_init();
setup_per_cpu_pageset();
numa_policy_init();
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c1..656dc3f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>

-/* This protects CPUs going up and down... */
+/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
-static DEFINE_MUTEX(cpu_bitmask_lock);

static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);

@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
*/
static int cpu_hotplug_disabled;

-#ifdef CONFIG_HOTPLUG_CPU
+static struct {
+ struct task_struct *active_writer;
+ struct mutex lock; /* Synchronizes accesses to refcount, */
+ /*
+ * Also blocks the new readers during
+ * an ongoing cpu hotplug operation.
+ */
+ int refcount;
+ wait_queue_head_t writer_queue;
+} cpu_hotplug;

-/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
-static struct task_struct *recursive;
-static int recursive_depth;
+#define writer_exists() (cpu_hotplug.active_writer != NULL)
+
+void __init cpu_hotplug_init(void)
+{
+ cpu_hotplug.active_writer = NULL;
+ mutex_init(&cpu_hotplug.lock);
+ cpu_hotplug.refcount = 0;
+ init_waitqueue_head(&cpu_hotplug.writer_queue);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU

void lock_cpu_hotplug(void)
{
- struct task_struct *tsk = current;
-
- if (tsk == recursive) {
- static int warnings = 10;
- if (warnings) {
- printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
- WARN_ON(1);
- warnings--;
- }
- recursive_depth++;
+ might_sleep();
+ if (cpu_hotplug.active_writer == current)
return;
- }
- mutex_lock(&cpu_bitmask_lock);
- recursive = tsk;
+ mutex_lock(&cpu_hotplug.lock);
+ cpu_hotplug.refcount++;
+ mutex_unlock(&cpu_hotplug.lock);
+
}
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);

void unlock_cpu_hotplug(void)
{
- WARN_ON(recursive != current);
- if (recursive_depth) {
- recursive_depth--;
+ if (cpu_hotplug.active_writer == current)
return;
- }
- recursive = NULL;
- mutex_unlock(&cpu_bitmask_lock);
+ mutex_lock(&cpu_hotplug.lock);
+ cpu_hotplug.refcount--;
+
+ if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
+ wake_up(&cpu_hotplug.writer_queue);
+
+ mutex_unlock(&cpu_hotplug.lock);
+
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);

#endif /* CONFIG_HOTPLUG_CPU */

+/*
+ * The following two API's must be used when attempting
+ * to serialize the updates to cpu_online_map, cpu_present_map.
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_hotplug_begin is always called after invoking
+ * cpu_maps_update_begin, we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ * writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock finds the refcount
+ * non zero and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * lock_cpu_hotplug() not an api which is called all that often.
+ *
+ */
+static void cpu_hotplug_begin(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ mutex_lock(&cpu_hotplug.lock);
+
+ cpu_hotplug.active_writer = current;
+ add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
+ while (cpu_hotplug.refcount) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&cpu_hotplug.lock);
+ schedule();
+ mutex_lock(&cpu_hotplug.lock);
+ }
+ remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
+}
+
+static void cpu_hotplug_done(void)
+{
+ cpu_hotplug.active_writer = NULL;
+ mutex_unlock(&cpu_hotplug.lock);
+}
/* Need to know about CPUs going up/down? */
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
int ret;
- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
ret = raw_notifier_chain_register(&cpu_chain, nb);
- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
return ret;
}

@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);

void unregister_cpu_notifier(struct notifier_block *nb)
{
- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);

@@ -147,6 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;

+ cpu_hotplug_begin();
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
@@ -166,9 +237,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);

- mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
- mutex_unlock(&cpu_bitmask_lock);

if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
@@ -203,6 +272,7 @@ out_allowed:
set_cpus_allowed(current, old_allowed);
out_release:
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+ cpu_hotplug_done();
return err;
}

@@ -210,13 +280,13 @@ int cpu_down(unsigned int cpu)
{
int err = 0;

- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_down(cpu, 0);

- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,6 +301,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;

+ cpu_hotplug_begin();
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
@@ -243,9 +314,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
}

/* Arch-specific enabling code. */
- mutex_lock(&cpu_bitmask_lock);
ret = __cpu_up(cpu);
- mutex_unlock(&cpu_bitmask_lock);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -258,6 +327,7 @@ out_notify:
__raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+ cpu_hotplug_done();

return ret;
}
@@ -275,13 +345,13 @@ int __cpuinit cpu_up(unsigned int cpu)
return -EINVAL;
}

- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_up(cpu, 0);

- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
return err;
}

@@ -292,7 +362,7 @@ int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;

- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +389,7 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
return error;
}

@@ -328,7 +398,7 @@ void enable_nonboot_cpus(void)
int cpu, error;

/* Allow everyone to use the CPU hotplug again */
- mutex_lock(&cpu_add_remove_lock);
+ cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
if (cpus_empty(frozen_cpus))
goto out;
@@ -344,6 +414,6 @@ void enable_nonboot_cpus(void)
}
cpus_clear(frozen_cpus);
out:
- mutex_unlock(&cpu_add_remove_lock);
+ cpu_maps_update_done();
}
#endif /* CONFIG_PM_SLEEP_SMP */
--
1.5.2.5

--
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a bargain,
because Freedom is priceless!"

Subject: [RFC PATCH 2/3] cpu-hotplug: Replace lock_cpu_hotplug() with get_online_cpus()

From: Gautham R Shenoy <[email protected]>
Date: Thu, 15 Nov 2007 18:14:29 +0530
Subject: [PATCH 2/3] cpu-hotplug: Replace lock_cpu_hotplug() with get_online_cpus()

Replace all uses of lock_cpu_hotplug/unlock_cpu_hotplug in the kernel with
get_online_cpus and put_online_cpus, as the new names highlight
the refcount semantics of these operations.

The new API guarantees protection against the cpu-hotplug operation,
but it doesn't guarantee serialized access to any of the local data
structures. Hence the changes need to be reviewed.

In case of pseries_add_processor/pseries_remove_processor, use
cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the
cpu_present_map there.
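
A typical read-side conversion then looks like this (an illustrative
sketch; the real call sites are in the diff below):

	get_online_cpus();		/* was: lock_cpu_hotplug() */
	for_each_online_cpu(cpu) {
		/*
		 * cpu cannot go offline here, but per-cpu data still
		 * needs the subsystem's own locking.
		 */
	}
	put_online_cpus();		/* was: unlock_cpu_hotplug() */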

Signed-off-by: Gautham R Shenoy <[email protected]>
---
Documentation/cpu-hotplug.txt | 11 ++++++-----
arch/mips/kernel/mips-mt-fpaff.c | 10 +++++-----
arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 ++++----
arch/powerpc/platforms/pseries/rtasd.c | 8 ++++----
arch/x86/kernel/cpu/mtrr/main.c | 8 ++++----
arch/x86/kernel/microcode.c | 16 ++++++++--------
drivers/lguest/x86/core.c | 8 ++++----
drivers/s390/char/sclp_config.c | 4 ++--
include/linux/cpu.h | 8 ++++----
kernel/cpu.c | 10 +++++-----
kernel/cpuset.c | 14 +++++++-------
kernel/rcutorture.c | 6 +++---
kernel/stop_machine.c | 4 ++--
net/core/flow.c | 4 ++--
14 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index a741f65..fb94f5a 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs.
for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.

#include <linux/cpu.h>
- lock_cpu_hotplug() and unlock_cpu_hotplug():
+ get_online_cpus() and put_online_cpus():

-The above calls are used to inhibit cpu hotplug operations. While holding the
-cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid
-cpus going away, you could also use preempt_disable() and preempt_enable()
-for those sections. Just remember the critical section cannot call any
+The above calls are used to inhibit cpu hotplug operations. While the
+cpu_hotplug.refcount is non zero, the cpu_online_map will not change.
+If you merely need to avoid cpus going away, you could also use
+preempt_disable() and preempt_enable() for those sections.
+Just remember the critical section cannot call any
function that can sleep or schedule this process away. The preempt_disable()
will work as long as stop_machine_run() is used to take a cpu down.

diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 892665b..bb4f00c 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
return -EFAULT;

- lock_cpu_hotplug();
+ get_online_cpus();
read_lock(&tasklist_lock);

p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ put_online_cpus();
return -ESRCH;
}

@@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,

out_unlock:
put_task_struct(p);
- unlock_cpu_hotplug();
+ put_online_cpus();
return retval;
}

@@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
if (len < real_len)
return -EINVAL;

- lock_cpu_hotplug();
+ get_online_cpus();
read_lock(&tasklist_lock);

retval = -ESRCH;
@@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,

out_unlock:
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ put_online_cpus();
if (retval)
return retval;
if (copy_to_user(user_mask_ptr, &mask, real_len))
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index fc48b96..67b8016 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -151,7 +151,7 @@ static int pseries_add_processor(struct device_node *np)
for (i = 0; i < nthreads; i++)
cpu_set(i, tmp);

- lock_cpu_hotplug();
+ cpu_maps_update_begin();

BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));

@@ -188,7 +188,7 @@ static int pseries_add_processor(struct device_node *np)
}
err = 0;
out_unlock:
- unlock_cpu_hotplug();
+ cpu_maps_update_done();
return err;
}

@@ -209,7 +209,7 @@ static void pseries_remove_processor(struct device_node *np)

nthreads = len / sizeof(u32);

- lock_cpu_hotplug();
+ cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != intserv[i])
@@ -223,7 +223,7 @@ static void pseries_remove_processor(struct device_node *np)
printk(KERN_WARNING "Could not find cpu to remove "
"with physical id 0x%x\n", intserv[i]);
}
- unlock_cpu_hotplug();
+ cpu_maps_update_done();
}

static int pseries_smp_notifier(struct notifier_block *nb,
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index 73401c8..e3078ce 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay)
{
int cpu;

- lock_cpu_hotplug();
+ get_online_cpus();
cpu = first_cpu(cpu_online_map);
for (;;) {
set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay)
set_cpus_allowed(current, CPU_MASK_ALL);

/* Drop hotplug lock, and sleep for the specified delay */
- unlock_cpu_hotplug();
+ put_online_cpus();
msleep_interruptible(delay);
- lock_cpu_hotplug();
+ get_online_cpus();

cpu = next_cpu(cpu, cpu_online_map);
if (cpu == NR_CPUS)
break;
}
- unlock_cpu_hotplug();
+ put_online_cpus();
}

static int rtasd(void *unused)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 9abbdf7..6aa0fbc 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -351,7 +351,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
replace = -1;

/* No CPU hotplug when we change MTRR entries */
- lock_cpu_hotplug();
+ get_online_cpus();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
for (i = 0; i < num_var_ranges; ++i) {
@@ -407,7 +407,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
error = i;
out:
mutex_unlock(&mtrr_mutex);
- unlock_cpu_hotplug();
+ put_online_cpus();
return error;
}

@@ -497,7 +497,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)

max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
- lock_cpu_hotplug();
+ get_online_cpus();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
@@ -538,7 +538,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
error = reg;
out:
mutex_unlock(&mtrr_mutex);
- unlock_cpu_hotplug();
+ put_online_cpus();
return error;
}
/**
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 09c3152..40cfd54 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
return -EINVAL;
}

- lock_cpu_hotplug();
+ get_online_cpus();
mutex_lock(&microcode_mutex);

user_buffer = (void __user *) buf;
@@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
ret = (ssize_t)len;

mutex_unlock(&microcode_mutex);
- unlock_cpu_hotplug();
+ put_online_cpus();

return ret;
}
@@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)

old = current->cpus_allowed;

- lock_cpu_hotplug();
+ get_online_cpus();
set_cpus_allowed(current, cpumask_of_cpu(cpu));

mutex_lock(&microcode_mutex);
if (uci->valid)
err = cpu_request_microcode(cpu);
mutex_unlock(&microcode_mutex);
- unlock_cpu_hotplug();
+ put_online_cpus();
set_cpus_allowed(current, old);
}
if (err)
@@ -817,9 +817,9 @@ static int __init microcode_init (void)
return PTR_ERR(microcode_pdev);
}

- lock_cpu_hotplug();
+ get_online_cpus();
error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
- unlock_cpu_hotplug();
+ put_online_cpus();
if (error) {
microcode_dev_exit();
platform_device_unregister(microcode_pdev);
@@ -839,9 +839,9 @@ static void __exit microcode_exit (void)

unregister_hotcpu_notifier(&mc_cpu_notifier);

- lock_cpu_hotplug();
+ get_online_cpus();
sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
- unlock_cpu_hotplug();
+ put_online_cpus();

platform_device_unregister(microcode_pdev);
}
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 482aec2..96d0fd0 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void)

/* We don't need the complexity of CPUs coming and going while we're
* doing this. */
- lock_cpu_hotplug();
+ get_online_cpus();
if (cpu_has_pge) { /* We have a broader idea of "global". */
/* Remember that this was originally set (for cleanup). */
cpu_had_pge = 1;
@@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void)
/* Turn off the feature in the global feature set. */
clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
}
- unlock_cpu_hotplug();
+ put_online_cpus();
};
/*:*/

void __exit lguest_arch_host_fini(void)
{
/* If we had PGE before we started, turn it back on now. */
- lock_cpu_hotplug();
+ get_online_cpus();
if (cpu_had_pge) {
set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
/* adjust_pge's argument "1" means set PGE. */
on_each_cpu(adjust_pge, (void *)1, 0, 1);
}
- unlock_cpu_hotplug();
+ put_online_cpus();
}


diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c
index 5322e5e..9dc77f1 100644
--- a/drivers/s390/char/sclp_config.c
+++ b/drivers/s390/char/sclp_config.c
@@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work)
struct sys_device *sysdev;

printk(KERN_WARNING TAG "cpu capability changed.\n");
- lock_cpu_hotplug();
+ get_online_cpus();
for_each_online_cpu(cpu) {
sysdev = get_cpu_sysdev(cpu);
kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
}
- unlock_cpu_hotplug();
+ put_online_cpus();
}

static void sclp_conf_receiver_fn(struct evbuf_header *evbuf)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d1676dd..9b80c1a 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -100,8 +100,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
mutex_unlock(cpu_hp_mutex);
}

-extern void lock_cpu_hotplug(void);
-extern void unlock_cpu_hotplug(void);
+extern void get_online_cpus(void);
+extern void put_online_cpus(void);
#define hotcpu_notifier(fn, pri) { \
static struct notifier_block fn##_nb = \
{ .notifier_call = fn, .priority = pri }; \
@@ -119,8 +119,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex)
static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
{ }

-#define lock_cpu_hotplug() do { } while (0)
-#define unlock_cpu_hotplug() do { } while (0)
+#define get_online_cpus() do { } while (0)
+#define put_online_cpus() do { } while (0)
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 656dc3f..b0c4152 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -48,7 +48,7 @@ void __init cpu_hotplug_init(void)

#ifdef CONFIG_HOTPLUG_CPU

-void lock_cpu_hotplug(void)
+void get_online_cpus(void)
{
might_sleep();
if (cpu_hotplug.active_writer == current)
@@ -58,9 +58,9 @@ void lock_cpu_hotplug(void)
mutex_unlock(&cpu_hotplug.lock);

}
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(get_online_cpus);

-void unlock_cpu_hotplug(void)
+void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
@@ -73,7 +73,7 @@ void unlock_cpu_hotplug(void)
mutex_unlock(&cpu_hotplug.lock);

}
-EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(put_online_cpus);

#endif /* CONFIG_HOTPLUG_CPU */

@@ -110,7 +110,7 @@ void cpu_maps_update_done(void)
* non zero and goes to sleep again.
*
* However, this is very difficult to achieve in practice since
- * lock_cpu_hotplug() not an api which is called all that often.
+ * get_online_cpus() not an api which is called all that often.
*
*/
static void cpu_hotplug_begin(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc4..cfaf641 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
*
* Call with cgroup_mutex held. May take callback_mutex during
* call due to the kfifo_alloc() and kmalloc() calls. May nest
- * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * a call to the get_online_cpus()/put_online_cpus() pair.
* Must not be called holding callback_mutex, because we must not
- * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
- * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
+ * call get_online_cpus() while holding callback_mutex. Elsewhere
+ * the kernel nests callback_mutex inside get_online_cpus() calls.
* So the reverse nesting would risk an ABBA deadlock.
*
* The three key local variables below are:
@@ -691,9 +691,9 @@ restart:

rebuild:
/* Have scheduler rebuild sched domains */
- lock_cpu_hotplug();
+ get_online_cpus();
partition_sched_domains(ndoms, doms);
- unlock_cpu_hotplug();
+ put_online_cpus();

done:
if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains(). The lock_cpu_hotplug()
+ * will call rebuild_sched_domains(). The get_online_cpus()
* call in rebuild_sched_domains() must not be made while holding
* callback_mutex. Elsewhere the kernel nests callback_mutex inside
- * lock_cpu_hotplug() calls. So the reverse nesting would risk an
+ * get_online_cpus() calls. So the reverse nesting would risk an
* ABBA deadlock.
*/

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c..fd59982 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
cpumask_t tmp_mask = CPU_MASK_ALL;
int i;

- lock_cpu_hotplug();
+ get_online_cpus();

/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
- unlock_cpu_hotplug();
+ put_online_cpus();
return;
}

@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
else
rcu_idle_cpu--;

- unlock_cpu_hotplug();
+ put_online_cpus();
}

/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821e..51b5ee5 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
int ret;

/* No CPUs can come up or down during this. */
- lock_cpu_hotplug();
+ get_online_cpus();
p = __stop_machine_run(fn, data, cpu);
if (!IS_ERR(p))
ret = kthread_stop(p);
else
ret = PTR_ERR(p);
- unlock_cpu_hotplug();
+ put_online_cpus();

return ret;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 3ed2b4b..6489f4e 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -293,7 +293,7 @@ void flow_cache_flush(void)
static DEFINE_MUTEX(flow_flush_sem);

/* Don't want cpus going down or up during this. */
- lock_cpu_hotplug();
+ get_online_cpus();
mutex_lock(&flow_flush_sem);
atomic_set(&info.cpuleft, num_online_cpus());
init_completion(&info.completion);
@@ -305,7 +305,7 @@ void flow_cache_flush(void)

wait_for_completion(&info.completion);
mutex_unlock(&flow_flush_sem);
- unlock_cpu_hotplug();
+ put_online_cpus();
}

static void __devinit flow_cache_cpu_prepare(int cpu)
--
1.5.2.5

--
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a bargain,
because Freedom is priceless!"

Subject: [RFC PATCH 3/3] cpu-hotplug: Replace per-subsystem mutexes with get_online_cpus()

From: Gautham R Shenoy <[email protected]>
Date: Thu, 15 Nov 2007 18:14:29 +0530
Subject: [PATCH 3/3] cpu-hotplug: Replace per-subsystem mutexes with get_online_cpus()

This patch converts the known per-subsystem mutexes to
get_online_cpus/put_online_cpus. It also eliminates the CPU_LOCK_ACQUIRE
and CPU_LOCK_RELEASE hotplug notification events.
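
The conversion pattern, taking the slab notifier as an example (a
sketch; the mm/slab.c hunk below has the full change): instead of
holding cache_chain_mutex across the entire hotplug operation via
CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE, the mutex is now taken around the
individual events, e.g.

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		mutex_lock(&cache_chain_mutex);
		err = cpuup_prepare(cpu);
		mutex_unlock(&cache_chain_mutex);
		break;

while paths such as kmem_cache_create() that relied on that mutex for a
stable view of cpu_online_map now bracket themselves with
get_online_cpus()/put_online_cpus().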

Signed-off-by: Gautham R Shenoy <[email protected]>
---
include/linux/notifier.h | 4 +---
kernel/cpu.c | 4 ----
kernel/sched.c | 25 +++++++++----------------
kernel/workqueue.c | 35 +++++++++++++++--------------------
mm/slab.c | 18 +++++++++++-------
5 files changed, 36 insertions(+), 50 deletions(-)

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 0c40cc0..5dfbc68 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret)
#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
-#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
-#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
-#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task,
+#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
* not handling interrupts, soon dead */

/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b0c4152..e0d3a4f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -218,7 +218,6 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;

cpu_hotplug_begin();
- raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
@@ -271,7 +270,6 @@ out_thread:
out_allowed:
set_cpus_allowed(current, old_allowed);
out_release:
- raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();
return err;
}
@@ -302,7 +300,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
return -EINVAL;

cpu_hotplug_begin();
- raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
if (ret == NOTIFY_BAD) {
@@ -326,7 +323,6 @@ out_notify:
if (ret != 0)
__raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
- raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();

return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f6bd11..5c30adc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -364,7 +364,6 @@ struct rq {
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
@@ -4474,13 +4473,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
struct task_struct *p;
int retval;

- mutex_lock(&sched_hotcpu_mutex);
+ get_online_cpus();
read_lock(&tasklist_lock);

p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- mutex_unlock(&sched_hotcpu_mutex);
+ put_online_cpus();
return -ESRCH;
}

@@ -4520,7 +4519,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
}
out_unlock:
put_task_struct(p);
- mutex_unlock(&sched_hotcpu_mutex);
+ put_online_cpus();
return retval;
}

@@ -4577,7 +4576,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
struct task_struct *p;
int retval;

- mutex_lock(&sched_hotcpu_mutex);
+ get_online_cpus();
read_lock(&tasklist_lock);

retval = -ESRCH;
@@ -4593,7 +4592,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)

out_unlock:
read_unlock(&tasklist_lock);
- mutex_unlock(&sched_hotcpu_mutex);
+ put_online_cpus();

return retval;
}
@@ -5536,9 +5535,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
struct rq *rq;

switch (action) {
- case CPU_LOCK_ACQUIRE:
- mutex_lock(&sched_hotcpu_mutex);
- break;

case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -5606,9 +5602,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
#endif
- case CPU_LOCK_RELEASE:
- mutex_unlock(&sched_hotcpu_mutex);
- break;
}
return NOTIFY_OK;
}
@@ -6562,10 +6555,10 @@ static int arch_reinit_sched_domains(void)
{
int err;

- mutex_lock(&sched_hotcpu_mutex);
+ get_online_cpus();
detach_destroy_domains(&cpu_online_map);
err = arch_init_sched_domains(&cpu_online_map);
- mutex_unlock(&sched_hotcpu_mutex);
+ put_online_cpus();

return err;
}
@@ -6676,12 +6669,12 @@ void __init sched_init_smp(void)
{
cpumask_t non_isolated_cpus;

- mutex_lock(&sched_hotcpu_mutex);
+ get_online_cpus();
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
- mutex_unlock(&sched_hotcpu_mutex);
+ put_online_cpus();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52d5e7c..1bddee3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
#endif
};

-/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
- threads to each one as cpus come/go. */
-static DEFINE_MUTEX(workqueue_mutex);
+/* Serializes the accesses to the list of workqueues. */
+static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);

static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
* Returns zero on success.
* Returns -ve errno on failure.
*
- * Appears to be racy against CPU hotplug.
- *
* schedule_on_each_cpu() is very slow.
*/
int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;

- preempt_disable(); /* CPU hotplug */
+ get_online_cpus();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);

@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
- preempt_enable();
flush_workqueue(keventd_wq);
+ put_online_cpus();
free_percpu(works);
return 0;
}
@@ -749,8 +746,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, singlethread_cpu);
start_workqueue_thread(cwq, -1);
} else {
- mutex_lock(&workqueue_mutex);
+ get_online_cpus();
+ spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
+ spin_unlock(&workqueue_lock);

for_each_possible_cpu(cpu) {
cwq = init_cpu_workqueue(wq, cpu);
@@ -759,7 +758,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, cpu);
start_workqueue_thread(cwq, cpu);
}
- mutex_unlock(&workqueue_mutex);
+ put_online_cpus();
}

if (err) {
@@ -774,7 +773,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
/*
* Our caller is either destroy_workqueue() or CPU_DEAD,
- * workqueue_mutex protects cwq->thread
+ * get_online_cpus() protects cwq->thread.
*/
if (cwq->thread == NULL)
return;
@@ -809,9 +808,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct cpu_workqueue_struct *cwq;
int cpu;

- mutex_lock(&workqueue_mutex);
+ get_online_cpus();
+ spin_lock(&workqueue_lock);
list_del(&wq->list);
- mutex_unlock(&workqueue_mutex);
+ spin_unlock(&workqueue_lock);
+ put_online_cpus();

for_each_cpu_mask(cpu, *cpu_map) {
cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -834,13 +835,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
action &= ~CPU_TASKS_FROZEN;

switch (action) {
- case CPU_LOCK_ACQUIRE:
- mutex_lock(&workqueue_mutex);
- return NOTIFY_OK;
-
- case CPU_LOCK_RELEASE:
- mutex_unlock(&workqueue_mutex);
- return NOTIFY_OK;

case CPU_UP_PREPARE:
cpu_set(cpu, cpu_populated_map);
@@ -853,7 +847,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
if (!create_workqueue_thread(cwq, cpu))
break;
- printk(KERN_ERR "workqueue for %i failed\n", cpu);
+ printk(KERN_ERR "workqueue [%s] for %i failed\n",
+ wq->name, cpu);
return NOTIFY_BAD;

case CPU_ONLINE:
diff --git a/mm/slab.c b/mm/slab.c
index cfa6be4..c2333b8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -730,8 +730,7 @@ static inline void init_lock_keys(void)
#endif

/*
- * 1. Guard access to the cache-chain.
- * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ * Guard access to the cache-chain.
*/
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
int err = 0;

switch (action) {
- case CPU_LOCK_ACQUIRE:
- mutex_lock(&cache_chain_mutex);
- break;
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&cache_chain_mutex);
err = cpuup_prepare(cpu);
+ mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
#endif
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+ mutex_lock(&cache_chain_mutex);
cpuup_canceled(cpu);
- break;
- case CPU_LOCK_RELEASE:
mutex_unlock(&cache_chain_mutex);
break;
}
@@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_map as well. Please see cpuup_callback
*/
+ get_online_cpus();
mutex_lock(&cache_chain_mutex);

list_for_each_entry(pc, &cache_chain, next) {
@@ -2396,6 +2394,7 @@ oops:
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
int ret;
BUG_ON(!cachep || in_interrupt());

+ get_online_cpus();
mutex_lock(&cache_chain_mutex);
ret = __cache_shrink(cachep);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
BUG_ON(!cachep || in_interrupt());

/* Find the cache in the chain of caches. */
+ get_online_cpus();
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
@@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
slab_error(cachep, "Can't free all objects");
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
return;
}

@@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)

__kmem_cache_destroy(cachep);
mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

--
1.5.2.5

--
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a bargain,
because Freedom is priceless!"

From: Ingo Molnar
Date: 2007-11-15 14:17:15
Subject: Re: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation


* Gautham R Shenoy <[email protected]> wrote:

> void lock_cpu_hotplug(void)
> {
> - struct task_struct *tsk = current;
> -
> - if (tsk == recursive) {
> - static int warnings = 10;
> - if (warnings) {
> - printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
> - WARN_ON(1);
> - warnings--;
> - }
> - recursive_depth++;

i nominate it for a 2.6.24 merge due to it fixing the above regression.
I've seen regular instances of the "Lukewarm IQ" message.

Ingo

From: Ingo Molnar
Date: 2007-11-15 14:18:06
Subject: Re: [RFC PATCH 0/3] Refcount Based Cpu Hotplug V3


* Gautham R Shenoy <[email protected]> wrote:

> Hi,
> This is Try #3 for the Refcount + Waitqueue based implementation
> for cpu-hotplug locking. The earlier versions can be found at
>
> http://lkml.org/lkml/2007/10/24/36
> and
> http://lkml.org/lkml/2007/10/16/118.
>
> This version drops patch 4 from the earlier series, which was
> basically removing the CPU_DEAD and CPU_UP_CANCELED event handling in
> the workqueue subsystem. This was based on Oleg Nesterov's comments,
> since the problem which that patch was attempting to solve is more
> complicated and needs to be looked into separately.
>
> The patch series is against v2.6.24-rc2 and has survived
> overnight kern_bench + cpu-offline-online tests on
> - 8 way x86,
> - 8 way x86_64,
> - 128 way ppc64 boxes.
>
> The patch series has gone through many iterations and looks to have
> matured in the process.
>
> Andrew, could you please consider it for the next -mm release if there
> are no objections?

really nice stuff!

Acked-by: Ingo Molnar <[email protected]>

since this is in essence a regression fix, how about a v2.6.24 merge?
CPU online/offline never worked really reliably and it's used by
suspend/resume as well.

Ingo

From: Ingo Molnar
Date: 2007-11-15 15:35:18
Subject: Re: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation


* Ingo Molnar <[email protected]> wrote:

> * Gautham R Shenoy <[email protected]> wrote:
>
> > void lock_cpu_hotplug(void)
> > {
> > - struct task_struct *tsk = current;
> > -
> > - if (tsk == recursive) {
> > - static int warnings = 10;
> > - if (warnings) {
> > - printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
> > - WARN_ON(1);
> > - warnings--;
> > - }
> > - recursive_depth++;
>
> i nominate it for a 2.6.24 merge due to it fixing the above
> regression. I've seen regular instances of the "Lukewarm IQ" message.

FYI, i've put these 3 patches into the scheduler git tree and it's
looking very good so far. So unless Andrew or Linus objects to put this
into v2.6.24, or there's serious questions during review, could we merge
it this way?

Ingo

From: Ralf Baechle
Date: 2007-11-15 15:38:34
Subject: Re: [RFC PATCH 2/3] cpu-hotplug: Replace lock_cpu_hotplug() with get_online_cpus()

On Thu, Nov 15, 2007 at 07:22:02PM +0530, Gautham R Shenoy wrote:

> Replace all uses of lock_cpu_hotplug/unlock_cpu_hotplug in the kernel with
> get_online_cpus and put_online_cpus, as the new names highlight
> the refcount semantics of these operations.
>
> The new API guarantees protection against the cpu-hotplug operation,
> but it doesn't guarantee serialized access to any of the local data
> structures. Hence the changes need to be reviewed.
>
> In case of pseries_add_processor/pseries_remove_processor, use
> cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the
> cpu_present_map there.
>
> Signed-off-by: Gautham R Shenoy <[email protected]>
> ---
> Documentation/cpu-hotplug.txt | 11 ++++++-----
> arch/mips/kernel/mips-mt-fpaff.c | 10 +++++-----

Acked-by: Ralf Baechle <[email protected]>

for the MIPS part.

Ralf

From: Ingo Molnar
Date: 2007-11-15 16:15:30
Subject: Re: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation


* Ingo Molnar <[email protected]> wrote:

> FYI, i've put these 3 patches into the scheduler git tree and it's
> looking very good so far. So unless Andrew or Linus objects to put
> this into v2.6.24, or there's serious questions during review, could
> we merge it this way?

i've got this trivial build fix for !SMP - otherwise it's still looking
good.

Ingo

------------------>
Subject: cpu hotplug: fix build on !CONFIG_SMP
From: Ingo Molnar <[email protected]>

fix build on !CONFIG_SMP.

Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/cpu.h | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

Index: linux/include/linux/cpu.h
===================================================================
--- linux.orig/include/linux/cpu.h
+++ linux/include/linux/cpu.h
@@ -71,19 +71,25 @@ static inline void unregister_cpu_notifi

int cpu_up(unsigned int cpu);

+extern void cpu_hotplug_init(void);
+
#else

static inline int register_cpu_notifier(struct notifier_block *nb)
{
return 0;
}
+
static inline void unregister_cpu_notifier(struct notifier_block *nb)
{
}

+static inline void cpu_hotplug_init(void)
+{
+}
+
#endif /* CONFIG_SMP */
extern struct sysdev_class cpu_sysdev_class;
-extern void cpu_hotplug_init(void);
extern void cpu_maps_update_begin(void);
extern void cpu_maps_update_done(void);

From: Oleg Nesterov
Date: 2007-11-15 16:36:47
Subject: Re: [RFC PATCH 3/3] cpu-hotplug: Replace per-subsystem mutexes with get_online_cpus()

On 11/15, Gautham R Shenoy wrote:
>
> This patch converts the known per-subsystem mutexes to
> get_online_cpus/put_online_cpus. It also eliminates the CPU_LOCK_ACQUIRE
> and CPU_LOCK_RELEASE hotplug notification events.
>
> Signed-off-by: Gautham R Shenoy <[email protected]>
> ---
> include/linux/notifier.h | 4 +---
> kernel/cpu.c | 4 ----
> kernel/sched.c | 25 +++++++++----------------
> kernel/workqueue.c | 35 +++++++++++++++--------------------
> mm/slab.c | 18 +++++++++++-------

Can't really comment on the changes outside workqueue.c, but imho the
whole series is very good.

Oleg.

From: Oleg Nesterov
Date: 2007-11-15 17:12:28
Subject: Re: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation

On 11/15, Gautham R Shenoy wrote:
>
> +static struct {
> + struct task_struct *active_writer;
> + struct mutex lock; /* Synchronizes accesses to refcount, */
> + /*
> + * Also blocks the new readers during
> + * an ongoing cpu hotplug operation.
> + */
> + int refcount;
> + wait_queue_head_t writer_queue;
> +} cpu_hotplug;
> ...
> void unlock_cpu_hotplug(void)
> {
> - WARN_ON(recursive != current);
> - if (recursive_depth) {
> - recursive_depth--;
> + if (cpu_hotplug.active_writer == current)
> return;
> - }
> - recursive = NULL;
> - mutex_unlock(&cpu_bitmask_lock);
> + mutex_lock(&cpu_hotplug.lock);
> + cpu_hotplug.refcount--;
> +
> + if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
> + wake_up(&cpu_hotplug.writer_queue);
> +
> + mutex_unlock(&cpu_hotplug.lock);
> +
> }
> ...
> +static void cpu_hotplug_begin(void)
> +{
> + DECLARE_WAITQUEUE(wait, current);
> +
> + mutex_lock(&cpu_hotplug.lock);
> +
> + cpu_hotplug.active_writer = current;
> + add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
> + while (cpu_hotplug.refcount) {
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + mutex_unlock(&cpu_hotplug.lock);
> + schedule();
> + mutex_lock(&cpu_hotplug.lock);
> + }
> + remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
> +}

Perhaps we can simplify this a little bit? I don't think we really need
cpu_hotplug.writer_queue, afaics we can just do

void unlock_cpu_hotplug(void)
{
	if (cpu_hotplug.active_writer == current)
		return;
	mutex_lock(&cpu_hotplug.lock);
	if (!--cpu_hotplug.refcount && cpu_hotplug.active_writer)
		wake_up_process(cpu_hotplug.active_writer);
	mutex_unlock(&cpu_hotplug.lock);
}

static void cpu_hotplug_begin(void)
{
	mutex_lock(&cpu_hotplug.lock);
	cpu_hotplug.active_writer = current;

	while (cpu_hotplug.refcount) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&cpu_hotplug.lock);
		schedule();
		mutex_lock(&cpu_hotplug.lock);
	}
}

(not that it matters, we can do this later even if I am right)

Oleg.

From: Christoph Lameter
Date: 2007-11-15 18:27:27
Subject: Re: [RFC PATCH 0/3] Refcount Based Cpu Hotplug V3

Good stuff! Better than the other locking schemes that I have seen go by.

From: Gautham R Shenoy
Subject: Re: [RFC PATCH 1/3] cpu-hotplug: Refcount Based Cpu Hotplug implementation

On Thu, Nov 15, 2007 at 05:05:20PM +0100, Ingo Molnar wrote:
>
> * Ingo Molnar <[email protected]> wrote:
>
> > FYI, i've put these 3 patches into the scheduler git tree and it's
> > looking very good so far. So unless Andrew or Linus objects to put
> > this into v2.6.24, or there's serious questions during review, could
> > we merge it this way?
>
> i've got this trivial build fix for !SMP - otherwise it's still looking
> good.
>

Thanks for this. I had compile tested only for !CONFIG_HOTPLUG_CPU and
missed !CONFIG_SMP.

Regards
gautham.

> Ingo
>
> ------------------>
> Subject: cpu hotplug: fix build on !CONFIG_SMP
> From: Ingo Molnar <[email protected]>
>
> fix build on !CONFIG_SMP.
>
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> include/linux/cpu.h | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> Index: linux/include/linux/cpu.h
> ===================================================================
> --- linux.orig/include/linux/cpu.h
> +++ linux/include/linux/cpu.h
> @@ -71,19 +71,25 @@ static inline void unregister_cpu_notifi
>
> int cpu_up(unsigned int cpu);
>
> +extern void cpu_hotplug_init(void);
> +
> #else
>
> static inline int register_cpu_notifier(struct notifier_block *nb)
> {
> return 0;
> }
> +
> static inline void unregister_cpu_notifier(struct notifier_block *nb)
> {
> }
>
> +static inline void cpu_hotplug_init(void)
> +{
> +}
> +
> #endif /* CONFIG_SMP */
> extern struct sysdev_class cpu_sysdev_class;
> -extern void cpu_hotplug_init(void);
> extern void cpu_maps_update_begin(void);
> extern void cpu_maps_update_done(void);
>

--
Gautham R Shenoy
Linux Technology Center
IBM India.
"Freedom comes with a price tag of responsibility, which is still a bargain,
because Freedom is priceless!"