2010-11-17 04:46:09

by Zheng, Shaohui

[permalink] [raw]
Subject: [5/8,v3] NUMA Hotplug Emulator: support cpu probe/release in x86

From: Shaohui Zheng <[email protected]>

Add a cpu probe/release interface under sysfs for x86. Users can use this
interface to emulate the cpu hot-add process; it is intended for cpu hotplug
testing. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
feature.

This interface provides a mechanism to emulate cpu hotplug in software,
which makes it possible to do cpu hotplug automation and stress
testing.

Directive:
*) Reserve CPUs through a grub parameter like:
maxcpus=4

the remaining CPUs will not be initialized.

*) Probe CPU
we can use the probe interface to hot-add new CPUs:
echo nid > /sys/devices/system/cpu/probe

*) Release a CPU
echo cpu > /sys/devices/system/cpu/release

A reserved CPU will be hot-added to the specified node.
1) nid == 0, the CPU will be added to the real node which the CPU
should be in
2) nid != 0, add the CPU to node nid even though it is a fake node.

CC: Ingo Molnar <[email protected]>
CC: Len Brown <[email protected]>
CC: Yinghai Lu <[email protected]>
Signed-off-by: Shaohui Zheng <[email protected]>
Signed-off-by: Haicheng Li <[email protected]>
---
Index: linux-hpe4/arch/x86/kernel/acpi/boot.c
===================================================================
--- linux-hpe4.orig/arch/x86/kernel/acpi/boot.c 2010-11-17 09:00:59.742608402 +0800
+++ linux-hpe4/arch/x86/kernel/acpi/boot.c 2010-11-17 09:01:10.202837209 +0800
@@ -647,8 +647,44 @@
}
EXPORT_SYMBOL(acpi_map_lsapic);

+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+static void acpi_map_cpu2node_emu(int cpu, int physid, int nid)
+{
+#ifdef CONFIG_ACPI_NUMA
+#ifdef CONFIG_X86_64
+ apicid_to_node[physid] = nid;
+ numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+ apicid_2_node[physid] = nid;
+ cpu_to_node_map[cpu] = nid;
+#endif
+#endif
+}
+
+static u16 cpu_to_apicid_saved[CONFIG_NR_CPUS];
+int __ref acpi_map_lsapic_emu(int pcpu, int nid)
+{
+ /* backup cpu apicid to array cpu_to_apicid_saved */
+ if (cpu_to_apicid_saved[pcpu] == 0 &&
+ per_cpu(x86_cpu_to_apicid, pcpu) != BAD_APICID)
+ cpu_to_apicid_saved[pcpu] = per_cpu(x86_cpu_to_apicid, pcpu);
+
+ per_cpu(x86_cpu_to_apicid, pcpu) = cpu_to_apicid_saved[pcpu];
+ acpi_map_cpu2node_emu(pcpu, per_cpu(x86_cpu_to_apicid, pcpu), nid);
+
+ return pcpu;
+}
+EXPORT_SYMBOL(acpi_map_lsapic_emu);
+#endif
+
int acpi_unmap_lsapic(int cpu)
{
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+ /* backup cpu apicid to array cpu_to_apicid_saved */
+ if (cpu_to_apicid_saved[cpu] == 0 &&
+ per_cpu(x86_cpu_to_apicid, cpu) != BAD_APICID)
+ cpu_to_apicid_saved[cpu] = per_cpu(x86_cpu_to_apicid, cpu);
+#endif
per_cpu(x86_cpu_to_apicid, cpu) = -1;
set_cpu_present(cpu, false);
num_processors--;
Index: linux-hpe4/arch/x86/kernel/smpboot.c
===================================================================
--- linux-hpe4.orig/arch/x86/kernel/smpboot.c 2010-11-17 09:00:59.753464132 +0800
+++ linux-hpe4/arch/x86/kernel/smpboot.c 2010-11-17 10:05:26.913464702 +0800
@@ -107,8 +107,6 @@
mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}

-ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
-ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
#else
static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
Index: linux-hpe4/arch/x86/kernel/topology.c
===================================================================
--- linux-hpe4.orig/arch/x86/kernel/topology.c 2010-11-17 09:01:10.192838977 +0800
+++ linux-hpe4/arch/x86/kernel/topology.c 2010-11-17 10:05:26.924085712 +0800
@@ -30,6 +30,9 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <asm/cpu.h>
+#include <linux/cpu.h>
+#include <linux/topology.h>
+#include <linux/acpi.h>

static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);

@@ -66,6 +69,74 @@
unregister_cpu(&per_cpu(cpu_devices, num).cpu);
}
EXPORT_SYMBOL(arch_unregister_cpu);
+
+ssize_t arch_cpu_probe(const char *buf, size_t count)
+{
+ int nid = 0;
+ int num = 0, selected = 0;
+
+ /* check parameters */
+ if (!buf || count < 2)
+ return -EPERM;
+
+ nid = simple_strtoul(buf, NULL, 0);
+ printk(KERN_DEBUG "Add a cpu to node : %d\n", nid);
+
+ if (nid < 0 || nid > nr_node_ids - 1) {
+ printk(KERN_ERR "Invalid NUMA node id: %d (0 <= nid < %d).\n",
+ nid, nr_node_ids);
+ return -EPERM;
+ }
+
+ if (!node_online(nid)) {
+ printk(KERN_ERR "NUMA node %d is not online, give up.\n", nid);
+ return -EPERM;
+ }
+
+ /* find first uninitialized cpu */
+ for_each_present_cpu(num) {
+ if (per_cpu(cpu_sys_devices, num) == NULL) {
+ selected = num;
+ break;
+ }
+ }
+
+ if (selected >= num_possible_cpus()) {
+ printk(KERN_ERR "No free cpu, give up cpu probing.\n");
+ return -EPERM;
+ }
+
+ /* register cpu */
+ arch_register_cpu_node(selected, nid);
+ acpi_map_lsapic_emu(selected, nid);
+
+ return count;
+}
+EXPORT_SYMBOL(arch_cpu_probe);
+
+ssize_t arch_cpu_release(const char *buf, size_t count)
+{
+ int cpu = 0;
+
+ cpu = simple_strtoul(buf, NULL, 0);
+ /* cpu 0 is not hotplugable */
+ if (cpu == 0) {
+ printk(KERN_ERR "can not release cpu 0.\n");
+ return -EPERM;
+ }
+
+ if (cpu_online(cpu)) {
+ printk(KERN_DEBUG "offline cpu %d.\n", cpu);
+ cpu_down(cpu);
+ }
+
+ arch_unregister_cpu(cpu);
+ acpi_unmap_lsapic(cpu);
+
+ return count;
+}
+EXPORT_SYMBOL(arch_cpu_release);
+
#else /* CONFIG_HOTPLUG_CPU */

static int __init arch_register_cpu(int num)
@@ -83,8 +154,14 @@
register_one_node(i);
#endif

- for_each_present_cpu(i)
- arch_register_cpu(i);
+ /*
+ * when cpu hotplug emulation enabled, register the online cpu only,
+ * the rests are reserved for cpu probe.
+ */
+ for_each_present_cpu(i) {
+ if ((cpu_hpe_on && cpu_online(i)) || !cpu_hpe_on)
+ arch_register_cpu(i);
+ }

return 0;
}
Index: linux-hpe4/arch/x86/mm/numa_64.c
===================================================================
--- linux-hpe4.orig/arch/x86/mm/numa_64.c 2010-11-17 09:01:10.132837502 +0800
+++ linux-hpe4/arch/x86/mm/numa_64.c 2010-11-17 09:01:10.202837209 +0800
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
+#include <linux/cpu.h>

#include <asm/e820.h>
#include <asm/proto.h>
@@ -915,6 +916,19 @@
}
#endif

+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+static __init int cpu_hpe_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+
+ if (!strncmp(opt, "on", 2) || !strncmp(opt, "1", 1))
+ cpu_hpe_on = 1;
+
+ return 0;
+}
+early_param("cpu_hpe", cpu_hpe_setup);
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */

void __cpuinit numa_set_node(int cpu, int node)
{
Index: linux-hpe4/drivers/acpi/processor_driver.c
===================================================================
--- linux-hpe4.orig/drivers/acpi/processor_driver.c 2010-11-17 09:00:59.765335724 +0800
+++ linux-hpe4/drivers/acpi/processor_driver.c 2010-11-17 09:01:10.212839478 +0800
@@ -530,6 +530,14 @@
goto err_free_cpumask;

sysdev = get_cpu_sysdev(pr->id);
+ /*
+ * Reserve cpu for hotplug emulation, the reserved cpu can be hot-added
+ * throu the cpu probe interface. Return directly.
+ */
+ if (sysdev == NULL) {
+ goto out;
+ }
+
if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
result = -EFAULT;
goto err_remove_fs;
@@ -570,6 +578,7 @@
goto err_remove_sysfs;
}

+out:
return 0;

err_remove_sysfs:
Index: linux-hpe4/drivers/base/cpu.c
===================================================================
--- linux-hpe4.orig/drivers/base/cpu.c 2010-11-17 09:01:10.192838977 +0800
+++ linux-hpe4/drivers/base/cpu.c 2010-11-17 09:01:10.212839478 +0800
@@ -22,9 +22,15 @@
};
EXPORT_SYMBOL(cpu_sysdev_class);

-static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
+DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);

#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_hpe_on is a switch to enable/disable cpu hotplug emulation. it is
+ * disabled in default, we can enable it throu grub parameter cpu_hpe=on
+ */
+int cpu_hpe_on;
+
static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
char *buf)
{
Index: linux-hpe4/include/linux/acpi.h
===================================================================
--- linux-hpe4.orig/include/linux/acpi.h 2010-11-17 09:00:59.772898926 +0800
+++ linux-hpe4/include/linux/acpi.h 2010-11-17 09:01:10.212839478 +0800
@@ -102,6 +102,7 @@
#ifdef CONFIG_ACPI_HOTPLUG_CPU
/* Arch dependent functions for cpu hotplug support */
int acpi_map_lsapic(acpi_handle handle, int *pcpu);
+int acpi_map_lsapic_emu(int pcpu, int nid);
int acpi_unmap_lsapic(int cpu);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */

Index: linux-hpe4/include/linux/cpu.h
===================================================================
--- linux-hpe4.orig/include/linux/cpu.h 2010-11-17 09:01:10.192838977 +0800
+++ linux-hpe4/include/linux/cpu.h 2010-11-17 09:01:10.212839478 +0800
@@ -30,6 +30,8 @@
struct sys_device sysdev;
};

+DECLARE_PER_CPU(struct sys_device *, cpu_sys_devices);
+
extern int register_cpu_node(struct cpu *cpu, int num, int nid);

static inline int register_cpu(struct cpu *cpu, int num)
@@ -149,6 +151,7 @@
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
int cpu_down(unsigned int cpu);
+extern int cpu_hpe_on;

#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
extern void cpu_hotplug_driver_lock(void);
@@ -171,6 +174,7 @@
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
#define unregister_hotcpu_notifier(nb) ({ (void)(nb); })
+static int cpu_hpe_on;
#endif /* CONFIG_HOTPLUG_CPU */

#ifdef CONFIG_PM_SLEEP_SMP
Index: linux-hpe4/mm/Kconfig
===================================================================
--- linux-hpe4.orig/mm/Kconfig 2010-11-17 09:01:10.192838977 +0800
+++ linux-hpe4/mm/Kconfig 2010-11-17 10:05:20.994710783 +0800
@@ -162,6 +162,17 @@
N is the number of hidden nodes, size is the memory size per
hidden node. This is only useful for debugging.

+config ARCH_CPU_PROBE_RELEASE
+ def_bool y
+ bool "CPU hotplug emulation"
+ depends on NUMA_HOTPLUG_EMU
+ ---help---
+ Enable cpu hotplug emulation. Reserve cpu with grub parameter
+ "maxcpus=N", where N is the initial CPU number, the rest physical
+ CPUs will not be initialized; there is a probe/release interface
+ is for cpu hot-add/hot-remove to specified node in software method.
+ This is for debuging and testing purpose
+
#
# If we have space for more page flags then we can enable additional
# optimizations and functionality.

--
Thanks & Regards,
Shaohui


2010-11-21 14:42:04

by Cong Wang

[permalink] [raw]
Subject: Re: [5/8,v3] NUMA Hotplug Emulator: support cpu probe/release in x86

On Wed, Nov 17, 2010 at 10:08:04AM +0800, [email protected] wrote:
>From: Shaohui Zheng <[email protected]>
>
>Add cpu interface probe/release under sysfs for x86. User can use this
>interface to emulate the cpu hot-add process, it is for cpu hotplug
>test purpose. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
>feature.
>
>This interface provides a mechanism to emulate cpu hotplug with software
> methods, it becomes possible to do cpu hotplug automation and stress
>testing.
>

Huh? We already have CPU online/offline...

Can you describe more about the difference?

Thanks.

2010-11-22 01:22:44

by Zheng, Shaohui

[permalink] [raw]
Subject: Re: [5/8,v3] NUMA Hotplug Emulator: support cpu probe/release in x86

On Sun, Nov 21, 2010 at 10:45:11PM +0800, Américo Wang wrote:
> On Wed, Nov 17, 2010 at 10:08:04AM +0800, [email protected] wrote:
> >From: Shaohui Zheng <[email protected]>
> >
> >Add cpu interface probe/release under sysfs for x86. User can use this
> >interface to emulate the cpu hot-add process, it is for cpu hotplug
> >test purpose. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
> >feature.
> >
> >This interface provides a mechanism to emulate cpu hotplug with software
> > methods, it becomes possible to do cpu hotplug automation and stress
> >testing.
> >
>
> Huh? We already have CPU online/offline...
>
> Can you describe more about the difference?
>
> Thanks.

Again, we have already tried to describe the difference between logical cpu
online/offline and physical cpu online/offline many times.

The following is my reply from other threads.
-------------------------------------------------------------------------------------------
>
> I don't get it. CPU hotplug can already be tested using echo 0/1 >
> online, and that works on 386. How is this different?
>
> It seems to add some numa magic. Why is it important?

Pavel,
it is not an easy thing to understand the full story since you may not work on this project,
so you have such a question. Let me give a simple introduction to the background.

We need to understand 2 different concepts if you want to know the reason why we developed
the hotplug emulator.

- CPU logical online/offline
This is the existing feature which you mentioned; we can online/offline CPUs through the sysfs
interface /sys/devices/system/cpu/cpuX/online (X is an integer, it stands for the CPU number)

echo 0/1 > /sys/devices/system/cpu/cpuX/online

This is logical CPU online/offline. When we do such an operation, the CPU is already plugged
into the motherboard, and the OS has initialized the CPU. The data structures and CPU entries in sysfs
are created, and the CPU present mask and possible mask are set; it does not involve any physical
hardware. The CPU status changes from offline to online, and the CPU is ready for the scheduler
to run processes on it.

CPU online/offline is controlled by the kernel option CONFIG_HOTPLUG_CPU.

- CPU hot-add/hot-remove

This is physical CPU hot-add/hot-remove on the motherboard, without shutting down the machine. After
the hot-add operation, the new CPU will be powered on, and the OS recognizes the new CPUs through SCI
interrupts; the OS then initializes the new CPUs, creates the related CPU structures, and creates sysfs entries
for the new CPUs. Once all that is done, the CPU is ready for logical online.

The process to hot-add a CPU:
1) The physical CPU is hot-added to the motherboard after the machine is powered on
2) the BIOS sends SCI interrupts to notify the OS
3) the Linux hotplug handler parses the data from the acpi_handle data
4) the hotplug handler initializes the CPU structure according to the cpu ACPI data

Current situation:
1) Provides developers an environment
Only very little hardware supports CPU hot-add/hot-remove; we need to create a working environment
for developers to write and debug hotplug code even though they do not have such hardware on hand.
This is exactly what the NUMA hotplug emulator does. Physical hotplug emulator would be a better name.

We had 2 solutions to solve this problem, and this one was finally selected; if you want to know
more about the solutions, we can continue on this thread.

2) Offers an automation test interface for Linux CPU hot-add/hot-remove code
Linux hot-add/hot-remove code has obvious bugs, but we do not see any automation test suite for it,
even in the LTP project (LTP has a hotplug suite for logical CPU online/offline).

It is known to be difficult to test physical hot-add/hot-remove code in an automated way, but the hotplug
emulator does a good job of it. We reproduced all the major hotplug bugs with the internal emulator
v2 and v3.

We are sharing it with the community, and hope more wisdom and talent will be brought into it. We want to show an
example of software emulation, and hope more people benefit from it; this is the purpose of this group
of patches.

PowerPC support
For ppc, it was added about half a year ago by Nathan Fontenot, but x86 does not have such a feature.
Thanks to lethal for mentioning it; we already did some research on it, and I will reply about it in another
thread.

commit 12633e803a2a556f6469e0933d08233d0844a2d9
Author: Nathan Fontenot <[email protected]>
Date: Wed Nov 25 17:23:25 2009 +0000

commit 1a8061c46c46c960f715c597b9d279ea2ba42bd9
Author: Nathan Fontenot <[email protected]>
Date: Tue Nov 24 21:13:32 2009 +0000


We inherit the naming style from ppc: CPU hot-add/hot-remove is called CPU probe/release in the kernel, and it is
controlled by the kernel option CONFIG_ARCH_CPU_PROBE_RELEASE.
--
Thanks & Regards,
Shaohui

2010-11-22 15:48:42

by Cong Wang

[permalink] [raw]
Subject: Re: [5/8,v3] NUMA Hotplug Emulator: support cpu probe/release in x86

On Mon, Nov 22, 2010 at 08:01:04AM +0800, Shaohui Zheng wrote:
>On Sun, Nov 21, 2010 at 10:45:11PM +0800, Américo Wang wrote:
>> On Wed, Nov 17, 2010 at 10:08:04AM +0800, [email protected] wrote:
>> >From: Shaohui Zheng <[email protected]>
>> >
>> >Add cpu interface probe/release under sysfs for x86. User can use this
>> >interface to emulate the cpu hot-add process, it is for cpu hotplug
>> >test purpose. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
>> >feature.
>> >
>> >This interface provides a mechanism to emulate cpu hotplug with software
>> > methods, it becomes possible to do cpu hotplug automation and stress
>> >testing.
>> >
>>
>> Huh? We already have CPU online/offline...
>>
>> Can you describe more about the difference?
>>
>> Thanks.
>
>Again, we already try to discribe the difference between logcial cpu
>online/offline and physical cpu online/offline many times.
>

I see, with "maxcpus=" we will only have the specified number
of CPU's which can be online/offline, you are trying to bring
the rest of CPU's hidden by "maxcpus=". :) Correct?

I think the idea is cool, but I think you need to improve
the documentation, for people who don't follow the hardware
concepts like me. ;)

Thanks.

--
Live like a child, think like the god.

2010-11-23 00:51:37

by Zheng, Shaohui

[permalink] [raw]
Subject: Re: [5/8,v3] NUMA Hotplug Emulator: support cpu probe/release in x86

On Mon, Nov 22, 2010 at 11:51:52PM +0800, Américo Wang wrote:
> On Mon, Nov 22, 2010 at 08:01:04AM +0800, Shaohui Zheng wrote:
> >On Sun, Nov 21, 2010 at 10:45:11PM +0800, Américo Wang wrote:
> >> On Wed, Nov 17, 2010 at 10:08:04AM +0800, [email protected] wrote:
> >> >From: Shaohui Zheng <[email protected]>
> >> >
> >> >Add cpu interface probe/release under sysfs for x86. User can use this
> >> >interface to emulate the cpu hot-add process, it is for cpu hotplug
> >> >test purpose. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
> >> >feature.
> >> >
> >> >This interface provides a mechanism to emulate cpu hotplug with software
> >> > methods, it becomes possible to do cpu hotplug automation and stress
> >> >testing.
> >> >
> >>
> >> Huh? We already have CPU online/offline...
> >>
> >> Can you describe more about the difference?
> >>
> >> Thanks.
> >
> >Again, we already try to discribe the difference between logcial cpu
> >online/offline and physical cpu online/offline many times.
> >
>
> I see, with "maxcpus=" we will only have the specified number
> of CPU's which can be online/offline, you are trying to bring
> the rest of CPU's hidden by "maxcpus=". :) Correct?
Yes, when we online the remaining CPUs, it tests our cpu hot-add code logic.
>
> I think the idea is cool, but I think you need to improve
> the documetion, for people who don't follow the hardware
> concepts like me. ;)
CPU hot-add is supported by only a little hardware, so many users might never
see such hardware; we should document it better. Thanks for the reminder.
>
> Thanks.
>
> --
> Live like a child, think like the god.
>

--
Thanks & Regards,
Shaohui