2015-04-24 10:17:55

by Gu Zheng

Subject: [RESEND RFC PATCH 1/2] x86/cpu hotplug: make apicid <--> cpuid mapping persistent

Yasuaki Ishimatsu found that the cpu<->node relationship can change
across node online/offline: workqueue uses a cpu<->node mapping that was
established at boot time, but the mapping may be changed by node
hotplugging.

Once pool->node points to a stale node, allocation failures like the
following happen (the mechanism is sketched just after the log):
==
SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
node 0: slabs: 6172, objs: 259224, free: 245741
node 1: slabs: 3261, objs: 136962, free: 127656
==
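
The failure mechanism can be sketched as follows (a simplified
illustration, not the actual workqueue code; worker_pool below is a
stand-in for the real structure):

struct worker_pool {
	int node;	/* cached once, at pool creation (i.e. at boot) */
};

static void *pool_alloc(struct worker_pool *pool, size_t size)
{
	/*
	 * If pool->node has since been hot-removed, this asks SLUB for
	 * memory on an offline node and fails as in the log above.
	 */
	return kmalloc_node(size, GFP_KERNEL, pool->node);
}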

As the apicid <--> pxm and pxm <--> node relationships are persistent,
the apicid <--> node mapping is persistent as well. The root cause is
therefore that the cpuid <--> apicid mapping is not persistent (the
current implementation always chooses the first free cpu id for a newly
added cpu). If we build a persistent cpuid <--> apicid mapping, this
problem is fixed.

This patch builds the complete cpuid <--> apicid <--> pxm <--> node
mapping for all possible processors at boot time. The implementation has
two steps (see the sketch below):
Step 1: generate a logical cpu id for every local APIC (both enabled and
disabled) when the local APIC is registered.
Step 2: map each cpu to its physical node via an additional ACPI
namespace walk over processor objects.
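
For illustration, a minimal userspace sketch of the idea behind Step 1
(the array names mirror the ones added below, but the bitmap and the
NR_CPUS value are simplifications): a given apicid is always handed the
same logical cpu id, however often it is removed and re-added.

#include <stdio.h>

#define NR_CPUS 8

static int apicid_to_cpuid[NR_CPUS] = { [0 ... NR_CPUS - 1] = -1 };
static unsigned long used_ids;	/* bits are set once and never cleared */

static int persistent_cpuid(int apicid)
{
	int i;

	/* reuse the id this apicid was given before, if any */
	for (i = 0; i < NR_CPUS; i++)
		if (apicid_to_cpuid[i] == apicid)
			return i;
	/* otherwise take the first never-used id and record it */
	for (i = 0; i < NR_CPUS; i++)
		if (!(used_ids & (1UL << i))) {
			apicid_to_cpuid[i] = apicid;
			used_ids |= 1UL << i;
			return i;
		}
	return -1;
}

int main(void)
{
	printf("apicid 0x10 -> cpu %d\n", persistent_cpuid(0x10)); /* 0 */
	printf("apicid 0x12 -> cpu %d\n", persistent_cpuid(0x12)); /* 1 */
	/* hot-remove and re-add apicid 0x10: it keeps cpu id 0 */
	printf("apicid 0x10 -> cpu %d\n", persistent_cpuid(0x10)); /* 0 */
	return 0;
}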

Please refer to:
https://lkml.org/lkml/2015/2/27/145
https://lkml.org/lkml/2015/3/25/989
for the previous discussion.

Reported-by: Yasuaki Ishimatsu <[email protected]>
Signed-off-by: Gu Zheng <[email protected]>
---
arch/ia64/kernel/acpi.c | 2 +-
arch/x86/include/asm/mpspec.h | 1 +
arch/x86/kernel/acpi/boot.c | 8 +--
arch/x86/kernel/apic/apic.c | 71 +++++++++++++++++++++++----
arch/x86/mm/numa.c | 20 --------
drivers/acpi/acpi_processor.c | 2 +-
drivers/acpi/bus.c | 3 +
drivers/acpi/processor_core.c | 108 ++++++++++++++++++++++++++++++++++------
include/linux/acpi.h | 2 +
9 files changed, 162 insertions(+), 55 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 2c44989..e7958f8 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
* ACPI based hotplug CPU support
*/
#ifdef CONFIG_ACPI_HOTPLUG_CPU
-static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
/*
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..db902d8 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif

int generic_processor_info(int apicid, int version);
+int __generic_processor_info(int apicid, int version, bool enabled);

#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 803b684..b084cc0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -174,15 +174,13 @@ static int acpi_register_lapic(int id, u8 enabled)
return -EINVAL;
}

- if (!enabled) {
+ if (!enabled)
++disabled_cpus;
- return -EINVAL;
- }

if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];

- return generic_processor_info(id, ver);
+ return __generic_processor_info(id, ver, enabled);
}

static int __init
@@ -726,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>

-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb5285..7fbf2cb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1977,7 +1977,38 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}

-int generic_processor_info(int apicid, int version)
+/*
+ * Persistent mappings of logical cpu number (cpuid) to local APIC id.
+ * Do not clear a mapping even if the cpu is hot-removed.
+ */
+static int apicid_to_cpuid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Internal cpu id bitmap: set a bit once the cpu is present; never clear it.
+ */
+static cpumask_t cpuid_mask = CPU_MASK_NONE;
+
+static int get_cpuid(int apicid)
+{
+ int free_id, i;
+
+ free_id = cpumask_next_zero(-1, &cpuid_mask);
+ if (free_id >= nr_cpu_ids)
+ return -1;
+
+ for (i = 0; i < free_id; i++)
+ if (apicid_to_cpuid[i] == apicid)
+ return i;
+
+ apicid_to_cpuid[free_id] = apicid;
+ cpumask_set_cpu(free_id, &cpuid_mask);
+
+ return free_id;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2010,8 +2041,8 @@ int generic_processor_info(int apicid, int version)
pr_warning("APIC: Disabling requested cpu."
" Processor %d/0x%x ignored.\n",
thiscpu, apicid);
-
- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -ENODEV;
}

@@ -2027,8 +2058,8 @@ int generic_processor_info(int apicid, int version)
"ACPI: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
-
- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -ENODEV;
}

@@ -2039,11 +2070,11 @@ int generic_processor_info(int apicid, int version)
"ACPI: NR_CPUS/possible_cpus limit of %i reached."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);

- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -EINVAL;
}

- num_processors++;
if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
@@ -2053,9 +2084,20 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+ } else {
+ cpu = get_cpuid(apicid);
+ if (cpu < 0) {
+ int thiscpu = max + disabled_cpus;

+ pr_warning(" Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+ if (enabled)
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
+ if (enabled)
+ num_processors++;
/*
* Validate version
*/
@@ -2071,7 +2113,8 @@ int generic_processor_info(int apicid, int version)
apic_version[boot_cpu_physical_apicid], cpu, version);
}

- physid_set(apicid, phys_cpu_present_map);
+ if (enabled)
+ physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;

@@ -2084,11 +2127,17 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+ if (enabled)
+ set_cpu_present(cpu, true);

return cpu;
}

+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb5..a733cf9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -702,24 +702,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}

-static __init int find_near_online_node(int node)
-{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
-
- for_each_online_node(n) {
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
-}
-
/*
* Setup early cpu_to_node.
*
@@ -746,8 +728,6 @@ void __init init_cpu_to_node(void)

if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- node = find_near_online_node(node);
numa_set_node(cpu, node);
}
}
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 1020b1b..9c7f842 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -284,7 +284,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
* less than the max # of CPUs. They should be ignored _iff
* they are physically not present.
*/
- if (pr->id == -1) {
+ if (pr->id == -1 || !cpu_present(pr->id)) {
int ret = acpi_processor_hotadd_init(pr);
if (ret)
return ret;
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 8b67bd0..36413b1 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -671,6 +671,9 @@ static int __init acpi_init(void)
acpi_debugfs_init();
acpi_sleep_proc_init();
acpi_wakeup_device_init();
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+ acpi_set_processor_mapping();
+#endif
return 0;
}

diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7962651..b2b44a0 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void)
}

static int map_lapic_id(struct acpi_subtable_header *entry,
- u32 acpi_id, int *apic_id)
+ u32 acpi_id, int *apic_id, bool ignore_disabled)
{
struct acpi_madt_local_apic *lapic =
container_of(entry, struct acpi_madt_local_apic, header);

- if (!(lapic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;

if (lapic->processor_id != acpi_id)
@@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry,
}

static int map_x2apic_id(struct acpi_subtable_header *entry,
- int device_declaration, u32 acpi_id, int *apic_id)
+ int device_declaration, u32 acpi_id,
+ int *apic_id, bool ignore_disabled)
{
struct acpi_madt_local_x2apic *apic =
container_of(entry, struct acpi_madt_local_x2apic, header);

- if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;

if (device_declaration && (apic->uid == acpi_id)) {
@@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry,
}

static int map_lsapic_id(struct acpi_subtable_header *entry,
- int device_declaration, u32 acpi_id, int *apic_id)
+ int device_declaration, u32 acpi_id,
+ int *apic_id, bool ignore_disabled)
{
struct acpi_madt_local_sapic *lsapic =
container_of(entry, struct acpi_madt_local_sapic, header);

- if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;

if (device_declaration) {
@@ -83,7 +85,7 @@ static int map_lsapic_id(struct acpi_subtable_header *entry,
return 0;
}

-static int map_madt_entry(int type, u32 acpi_id)
+static int map_madt_entry(int type, u32 acpi_id, bool ignore_disabled)
{
unsigned long madt_end, entry;
int phys_id = -1; /* CPU hardware ID */
@@ -103,13 +105,16 @@ static int map_madt_entry(int type, u32 acpi_id)
struct acpi_subtable_header *header =
(struct acpi_subtable_header *)entry;
if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
- if (!map_lapic_id(header, acpi_id, &phys_id))
+ if (!map_lapic_id(header, acpi_id, &phys_id,
+ ignore_disabled))
break;
} else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
- if (!map_x2apic_id(header, type, acpi_id, &phys_id))
+ if (!map_x2apic_id(header, type, acpi_id, &phys_id,
+ ignore_disabled))
break;
} else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
- if (!map_lsapic_id(header, type, acpi_id, &phys_id))
+ if (!map_lsapic_id(header, type, acpi_id, &phys_id,
+ ignore_disabled))
break;
}
entry += header->length;
@@ -117,7 +122,8 @@ static int map_madt_entry(int type, u32 acpi_id)
return phys_id;
}

-static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
+static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id,
+ bool ignore_disabled)
{
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *obj;
@@ -138,28 +144,34 @@ static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id)

header = (struct acpi_subtable_header *)obj->buffer.pointer;
if (header->type == ACPI_MADT_TYPE_LOCAL_APIC)
- map_lapic_id(header, acpi_id, &phys_id);
+ map_lapic_id(header, acpi_id, &phys_id, ignore_disabled);
else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC)
- map_lsapic_id(header, type, acpi_id, &phys_id);
+ map_lsapic_id(header, type, acpi_id, &phys_id, ignore_disabled);
else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC)
- map_x2apic_id(header, type, acpi_id, &phys_id);
+ map_x2apic_id(header, type, acpi_id, &phys_id, ignore_disabled);

exit:
kfree(buffer.pointer);
return phys_id;
}

-int acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
+static int __acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id,
+ bool ignore_disabled)
{
int phys_id;

- phys_id = map_mat_entry(handle, type, acpi_id);
+ phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled);
if (phys_id == -1)
- phys_id = map_madt_entry(type, acpi_id);
+ phys_id = map_madt_entry(type, acpi_id, ignore_disabled);

return phys_id;
}

+int acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
+{
+ return __acpi_get_phys_id(handle, type, acpi_id, true);
+}
+
int acpi_map_cpuid(int phys_id, u32 acpi_id)
{
#ifdef CONFIG_SMP
@@ -216,6 +228,68 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
}
EXPORT_SYMBOL_GPL(acpi_get_cpuid);

+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+static bool map_processor(acpi_handle handle, int *phys_id, int *cpuid)
+{
+ int type;
+ u32 acpi_id;
+ acpi_status status;
+ acpi_object_type acpi_type;
+ unsigned long long tmp;
+ union acpi_object object = { 0 };
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+
+ status = acpi_get_type(handle, &acpi_type);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ switch (acpi_type) {
+ case ACPI_TYPE_PROCESSOR:
+ status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = object.processor.proc_id;
+ break;
+ case ACPI_TYPE_DEVICE:
+ status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = tmp;
+ break;
+ default:
+ return false;
+ }
+
+ type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
+
+ *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
+ *cpuid = acpi_map_cpuid(*phys_id, acpi_id);
+ if (*cpuid == -1)
+ return false;
+ return true;
+}
+
+static acpi_status __init
+set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
+ void **rv)
+{
+ int apic_id;
+ int cpu_id;
+
+ if (!map_processor(handle, &apic_id, &cpu_id))
+ return AE_ERROR;
+ acpi_map_cpu2node(handle, cpu_id, apic_id);
+ return AE_OK;
+}
+
+void __init acpi_set_processor_mapping(void)
+{
+ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ set_processor_node_mapping, NULL, NULL, NULL);
+}
+#endif
+
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
u64 *phys_addr, int *ioapic_id)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index dd12127..a7bf53c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -156,6 +156,8 @@ void acpi_numa_arch_fixup(void);
/* Arch dependent functions for cpu hotplug support */
int acpi_map_cpu(acpi_handle handle, int physid, int *pcpu);
int acpi_unmap_cpu(int cpu);
+void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
+void __init acpi_set_processor_mapping(void);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */

#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
--
1.7.7
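
The boot-time flow this patch introduces can be summarized as the
following call sketch (reconstructed from the hunks above):

acpi_init()
  acpi_set_processor_mapping()				/* new */
    acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ...)
      set_processor_node_mapping()
        map_processor()		/* resolves apicid/cpuid, disabled LAPICs too */
          __acpi_get_phys_id(handle, type, acpi_id, false)
          acpi_map_cpuid(phys_id, acpi_id)
        acpi_map_cpu2node(handle, cpu_id, apic_id)	/* cpu <-> node */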


2015-04-24 10:18:06

by Gu Zheng

Subject: [RESEND RFC PATCH 2/2] gfp: use the best near online node if the target node is offline

Since the change to the cpu <--> node mapping (each possible cpu is
mapped to its physical node at boot), the node of a cpu may not be
online, so fall back to the nearest online node in the low-level
allocation APIs when the target node is offline.
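
For illustration, a standalone sketch of the fallback search over an
assumed distance table (the matrix values are made up; in the kernel,
node_distance() reads arch/firmware data such as the ACPI SLIT):

#include <stdio.h>
#include <limits.h>

#define MAX_NUMNODES 3

/* assumed SLIT-style matrix: 10 means local, larger means farther */
static const int distance[MAX_NUMNODES][MAX_NUMNODES] = {
	{ 10, 20, 21 },
	{ 20, 10, 11 },
	{ 21, 11, 10 },
};
static const int online[MAX_NUMNODES] = { 1, 1, 0 };	/* node 2 offline */

static int find_near_online_node(int node)
{
	int n, best_node = -1, min_val = INT_MAX;

	for (n = 0; n < MAX_NUMNODES; n++) {
		if (!online[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best_node = n;
		}
	}
	return best_node;
}

int main(void)
{
	/* an allocation aimed at offline node 2 falls back to node 1 */
	printf("fallback for node 2: node %d\n", find_near_online_node(2));
	return 0;
}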

Signed-off-by: Gu Zheng <[email protected]>
---
include/linux/gfp.h | 28 +++++++++++++++++++++++++++-
1 files changed, 27 insertions(+), 1 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a9373..19684a8 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -298,9 +298,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

+static int find_near_online_node(int node)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for_each_online_node(n) {
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ return best_node;
+}
+
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
+ /* Offline node, use the best near online node */
+ if (!node_online(nid))
+ nid = find_near_online_node(nid);
+
/* Unknown node is current node */
if (nid < 0)
nid = numa_node_id();
@@ -311,7 +333,11 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
- VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+ /* Offline node, use the best near online node */
+ if (!node_online(nid))
+ nid = find_near_online_node(nid);
+
+ VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);

return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
--
1.7.7

2015-04-24 14:20:24

by Rafael J. Wysocki

Subject: Re: [RESEND RFC PATCH 1/2] x86/cpu hotplug: make apicid <--> cpuid mapping persistent

On Friday, April 24, 2015 05:58:32 PM Gu Zheng wrote:
> [...]
>
> Reported-by: Yasuaki Ishimatsu <[email protected]>
> Signed-off-by: Gu Zheng <[email protected]>

This one will conflict with the ARM64 ACPI material when that goes in, so it'll
need to be rebased on top of that.


--
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

2015-04-24 20:01:57

by Andrew Morton

Subject: Re: [RESEND RFC PATCH 2/2] gfp: use the best near online node if the target node is offline

On Fri, 24 Apr 2015 17:58:33 +0800 Gu Zheng <[email protected]> wrote:

> Since the change to the cpu <--> node mapping (each possible cpu is
> mapped to its physical node at boot), the node of a cpu may not be
> online, so fall back to the nearest online node in the low-level
> allocation APIs when the target node is offline.
>
> ...
>
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -298,9 +298,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
> return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
> }
>
> +static int find_near_online_node(int node)
> +{
> + int n, val;
> + int min_val = INT_MAX;
> + int best_node = -1;
> +
> + for_each_online_node(n) {
> + val = node_distance(node, n);
> +
> + if (val < min_val) {
> + min_val = val;
> + best_node = n;
> + }
> + }
> +
> + return best_node;
> +}

This should be `inline' if it's in a header file.

But it is far too large to be inlined anyway - please move it to a .c file.

And please document it. A critical thing to describe is how we
determine whether a node is "near". There are presumably multiple ways
in which we could decide that a node is "near" (number of hops, minimum
latency, ...). Which one did you choose, and why?
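
A sketch of the suggested split (the .c file chosen here is only an
assumption; the body is the one from the patch):

/* include/linux/gfp.h: declaration only */
int find_near_online_node(int node);

/* mm/page_alloc.c: a single out-of-line definition */
int find_near_online_node(int node)
{
	int n, val, min_val = INT_MAX, best_node = -1;

	for_each_online_node(n) {
		val = node_distance(node, n);
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	return best_node;
}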

> static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
> unsigned int order)
> {
> + /* Offline node, use the best near online node */
> + if (!node_online(nid))
> + nid = find_near_online_node(nid);
> +
> /* Unknown node is current node */
> if (nid < 0)
> nid = numa_node_id();
> @@ -311,7 +333,11 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
> static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
> unsigned int order)
> {
> - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
> + /* Offline node, use the best near online node */
> + if (!node_online(nid))
> + nid = find_near_online_node(nid);
> +
> + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>
> return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
> }

Ouch. These functions are called very frequently, and adding overhead
to them is a big deal. And the patch even adds overhead to non-x86
architectures which don't benefit from it!

Is there no way this problem can be fixed somewhere else? Preferably
by fixing things up at hotplug time.
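
One possible shape of such a hotplug-time fixup (a sketch; the helper
name is hypothetical and would re-resolve whatever node ids are cached):

#include <linux/memory.h>
#include <linux/notifier.h>

/* hypothetical helper that recomputes cached fallback nodes */
void update_cached_fallback_nodes(void);

static int node_state_callback(struct notifier_block *nb,
			       unsigned long action, void *arg)
{
	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/* recompute once per hotplug event, not per allocation */
		update_cached_fallback_nodes();
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block node_state_nb = {
	.notifier_call = node_state_callback,
};

/* registered once at init:
 *	register_hotplug_memory_notifier(&node_state_nb);
 */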

2015-04-25 10:15:15

by Hanjun Guo

Subject: Re: [RESEND RFC PATCH 1/2] x86/cpu hotplug: make apicid <--> cpuid mapping persistent

On 2015/4/24 22:45, Rafael J. Wysocki wrote:
> On Friday, April 24, 2015 05:58:32 PM Gu Zheng wrote:
>> [...]
>>
>> Reported-by: Yasuaki Ishimatsu <[email protected]>
>> Signed-off-by: Gu Zheng <[email protected]>
> This one will conflict with the ARM64 ACPI material when that goes in, so it'll
> need to be rebased on top of that.

Yes, please. Then I will take a look too.

Thanks
Hanjun

2015-04-27 09:45:38

by Kamezawa Hiroyuki

Subject: Re: [RESEND RFC PATCH 2/2] gfp: use the best near online node if the target node is offline

On 2015/04/25 5:01, Andrew Morton wrote:
> On Fri, 24 Apr 2015 17:58:33 +0800 Gu Zheng <[email protected]> wrote:
>
>> [...]
>
> [...]
>
>> static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>> unsigned int order)
>> {
>> + /* Offline node, use the best near online node */
>> + if (!node_online(nid))
>> + nid = find_near_online_node(nid);
>> +
>> /* Unknown node is current node */
>> if (nid < 0)
>> nid = numa_node_id();
>> @@ -311,7 +333,11 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>> static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
>> unsigned int order)
>> {
>> - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
>> + /* Offline node, use the best near online node */
>> + if (!node_online(nid))
>> + nid = find_near_online_node(nid);

In the above VM_BUG_ON(), !node_online(nid) is the bug condition.

>> +
>> + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>>
>> return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
>> }
>
> Ouch. These functions are called very frequently, and adding overhead
> to them is a big deal. And the patch even adds overhead to non-x86
> architectures which don't benefit from it!
>
> Is there no way this problem can be fixed somewhere else? Preferably
> by fixing things up at hotplug time.

I agree. The results should be cached, if necessary in per-cpu data.
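
Such a cache might look like the following (a sketch; the names are
hypothetical, a per-node table is used for brevity, and the refresh
would be driven from the node online/offline path, not the allocator):

/* one cached fallback per node, refreshed at hotplug time */
static int nearest_online_node[MAX_NUMNODES];

static void refresh_nearest_online_nodes(void)
{
	int node;

	for_each_node(node)
		nearest_online_node[node] = node_online(node) ?
			node : find_near_online_node(node);
}

/* the allocator fast path then reduces to an array lookup */
static inline int fallback_nid(int nid)
{
	return nid < 0 ? numa_node_id() : nearest_online_node[nid];
}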


Thanks,
-Kame

2015-04-27 11:02:34

by Gu Zheng

Subject: Re: [RESEND RFC PATCH 2/2] gfp: use the best near online node if the target node is offline

Hi Andrew,

On 04/25/2015 04:01 AM, Andrew Morton wrote:

> On Fri, 24 Apr 2015 17:58:33 +0800 Gu Zheng <[email protected]> wrote:
>
>> Since the change to the cpu <--> node mapping (each possible cpu is
>> mapped to its physical node at boot), the node of a cpu may not be
>> online, so fall back to the nearest online node in the low-level
>> allocation APIs when the target node is offline.
>>
>> ...
>>
>> --- a/include/linux/gfp.h
>> +++ b/include/linux/gfp.h
>> @@ -298,9 +298,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
>> return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
>> }
>>
>> +static int find_near_online_node(int node)
>> +{
>> + int n, val;
>> + int min_val = INT_MAX;
>> + int best_node = -1;
>> +
>> + for_each_online_node(n) {
>> + val = node_distance(node, n);
>> +
>> + if (val < min_val) {
>> + min_val = val;
>> + best_node = n;
>> + }
>> + }
>> +
>> + return best_node;
>> +}
>
> This should be `inline' if it's in a header file.
>
> But it is far too large to be inlined anyway - please move it to a .c file.

Agree.

>
> And please document it. A critical thing to describe is how we
> determine whether a node is "near". There are presumably multiple ways
> in which we could decide that a node is "near" (number of hops, minimum
> latency, ...). Which one did you choose, and why?

It just reuses the code dropped in PATCH 1/2, based on the node_distance
table, which is arch-specific; the data mostly comes from firmware info,
e.g. the ACPI SLIT table.
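
For reference, the firmware-provided distances can be inspected at run
time via sysfs (example values for a hypothetical two-node box; the
local distance in the SLIT is 10, and larger values mean relatively
farther nodes):

$ cat /sys/devices/system/node/node0/distance
10 21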

>
>> static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>> unsigned int order)
>> {
>> + /* Offline node, use the best near online node */
>> + if (!node_online(nid))
>> + nid = find_near_online_node(nid);
>> +
>> /* Unknown node is current node */
>> if (nid < 0)
>> nid = numa_node_id();
>> @@ -311,7 +333,11 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>> static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
>> unsigned int order)
>> {
>> - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
>> + /* Offline node, use the best near online node */
>> + if (!node_online(nid))
>> + nid = find_near_online_node(nid);
>> +
>> + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>>
>> return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
>> }
>
> Ouch. These functions are called very frequently, and adding overhead
> to them is a big deal. And the patch even adds overhead to non-x86
> architectures which don't benefit from it!
>
> Is there no way this problem can be fixed somewhere else? Preferably
> by fixing things up at hotplug time.

As Kame suggested, maintaining a per-cpu cache of the alternative node,
only for the x86 arch, seems a good choice.

Regards,
Gu


2015-04-27 11:02:40

by Gu Zheng

Subject: Re: [RESEND RFC PATCH 2/2] gfp: use the best near online node if the target node is offline

Hi Kame-san,

On 04/27/2015 05:44 PM, Kamezawa Hiroyuki wrote:

> On 2015/04/25 5:01, Andrew Morton wrote:
>> On Fri, 24 Apr 2015 17:58:33 +0800 Gu Zheng <[email protected]> wrote:
>>
>>> [...]
>>
>> [...]
>>
>>> static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>>> unsigned int order)
>>> {
>>> + /* Offline node, use the best near online node */
>>> + if (!node_online(nid))
>>> + nid = find_near_online_node(nid);
>>> +
>>> /* Unknown node is current node */
>>> if (nid < 0)
>>> nid = numa_node_id();
>>> @@ -311,7 +333,11 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>>> static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
>>> unsigned int order)
>>> {
>>> - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
>>> + /* Offline node, use the best near online node */
>>> + if (!node_online(nid))
>>> + nid = find_near_online_node(nid);
>
> In above VM_BUG_ON(), !node_online(nid) is the bug.


But it becomes possible to reach here with an offline node after the
change in PATCH 1/2.

>
>>> +
>>> + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
>>>
>>> return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
>>> }
>>
>> Ouch. These functions are called very frequently, and adding overhead
>> to them is a big deal. And the patch even adds overhead to non-x86
>> architectures which don't benefit from it!
>>
>> Is there no way this problem can be fixed somewhere else? Preferably
>> by fixing things up at hotplug time.
>
> I agree. the results should be cached. If necessary, in per-cpu line.

Sounds great, will try this way.

Regards,
Gu

>
>
> Thanks,
> -Kame
>
>

2015-04-27 11:03:00

by Gu Zheng

Subject: Re: [RESEND RFC PATCH 1/2] x86/cpu hotplug: make apicid <--> cpuid mapping persistent

Hi Hanjun, Rafael,

On 04/25/2015 06:14 PM, Hanjun Guo wrote:

> On 2015/4/24 22:45, Rafael J. Wysocki wrote:
>> On Friday, April 24, 2015 05:58:32 PM Gu Zheng wrote:
>>> [...]
>>>
>>> Reported-by: Yasuaki Ishimatsu <[email protected]>
>>> Signed-off-by: Gu Zheng <[email protected]>
>> This one will conflict with the ARM64 ACPI material when that goes in, so it'll
>> need to be rebased on top of that.
>
> Yes, please. Then I will take a look too.

Thanks for your reminder, will rebase it soon.

Regards,
Gu

>
> Thanks
> Hanjun
>