Yasuaki Ishimatsu found that with node online/offline, the cpu<->node
relationship is established. Workqueue, however, uses the information that
was established at boot time, which may be changed by node hotplugging.
Once pool->node points to a stale node, the following allocation failure
happens.
==
SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
cache: kmalloc-192, object size: 192, buffer size: 192, default
order:
1, min order: 0
node 0: slabs: 6172, objs: 259224, free: 245741
node 1: slabs: 3261, objs: 136962, free: 127656
==
As the apicid <--> pxm and pxm <--> node relationships are persistent,
the apicid <--> node mapping is persistent too, so the root cause is that the
cpu-id <-> lapicid mapping is not persistent (because the current
implementation always chooses the first free cpu id for a newly added cpu).
If we can build a persistent cpu-id <-> lapicid relationship, this problem
will be fixed.
This patch tries to build the whole mapping cpuid <-> apicid <-> pxm <-> node
for all possible processors at boot. The implementation has 2 steps:
Step1: generate a logical cpu id for every local apic (both enabled and disabled)
when registering the local apic
Step2: map the cpu to the physical node via an additional acpi ns walk for processors.
Please refer to:
https://lkml.org/lkml/2015/2/27/145
https://lkml.org/lkml/2015/3/25/989
for the previous discussion.
---
V2: rebase on latest upstream.
---
Signed-off-by: Gu Zheng <[email protected]>
---
arch/ia64/kernel/acpi.c | 2 +-
arch/x86/include/asm/mpspec.h | 1 +
arch/x86/kernel/acpi/boot.c | 8 ++-
arch/x86/kernel/apic/apic.c | 73 ++++++++++++++++++++-----
arch/x86/mm/numa.c | 20 -------
drivers/acpi/acpi_processor.c | 2 +-
drivers/acpi/bus.c | 3 ++
drivers/acpi/processor_core.c | 121 ++++++++++++++++++++++++++++++++++--------
include/linux/acpi.h | 2 +
9 files changed, 172 insertions(+), 60 deletions(-)
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index b1698bc..7db5563 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
* ACPI based hotplug CPU support
*/
#ifdef CONFIG_ACPI_HOTPLUG_CPU
-static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
/*
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..db902d8 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
+int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dbe76a1..c79115b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -174,15 +174,13 @@ static int acpi_register_lapic(int id, u8 enabled)
return -EINVAL;
}
- if (!enabled) {
+ if (!enabled)
++disabled_cpus;
- return -EINVAL;
- }
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
- return generic_processor_info(id, ver);
+ return __generic_processor_info(id, ver, enabled);
}
static int __init
@@ -726,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb5285..7fbf2cb 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1977,7 +1977,38 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-int generic_processor_info(int apicid, int version)
+/*
+ * Persistent mapping from logical cpu number (cpuid) to local APIC id.
+ * The mapping is not cleared even if the cpu is hot-removed.
+ * */
+static int apicid_to_cpuid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Internal cpu id bits, set the bit once cpu present, and never clear it.
+ * */
+static cpumask_t cpuid_mask = CPU_MASK_NONE;
+
+static int get_cpuid(int apicid)
+{
+ int free_id, i;
+
+ free_id = cpumask_next_zero(-1, &cpuid_mask);
+ if (free_id >= nr_cpu_ids)
+ return -1;
+
+ for (i = 0; i < free_id; i++)
+ if (apicid_to_cpuid[i] == apicid)
+ return i;
+
+ apicid_to_cpuid[free_id] = apicid;
+ cpumask_set_cpu(free_id, &cpuid_mask);
+
+ return free_id;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2010,8 +2041,8 @@ int generic_processor_info(int apicid, int version)
pr_warning("APIC: Disabling requested cpu."
" Processor %d/0x%x ignored.\n",
thiscpu, apicid);
-
- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -ENODEV;
}
@@ -2027,8 +2058,8 @@ int generic_processor_info(int apicid, int version)
"ACPI: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
-
- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -ENODEV;
}
@@ -2039,11 +2070,11 @@ int generic_processor_info(int apicid, int version)
"ACPI: NR_CPUS/possible_cpus limit of %i reached."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
- disabled_cpus++;
+ if (enabled)
+ disabled_cpus++;
return -EINVAL;
}
- num_processors++;
if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
@@ -2053,9 +2084,20 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
-
+ } else {
+ cpu = get_cpuid(apicid);
+ if (cpu < 0) {
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning(" Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+ if (enabled)
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
+ if (enabled)
+ num_processors++;
/*
* Validate version
*/
@@ -2071,7 +2113,8 @@ int generic_processor_info(int apicid, int version)
apic_version[boot_cpu_physical_apicid], cpu, version);
}
- physid_set(apicid, phys_cpu_present_map);
+ if (enabled)
+ physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
@@ -2084,11 +2127,17 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+ if (enabled)
+ set_cpu_present(cpu, true);
return cpu;
}
+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb5..a733cf9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -702,24 +702,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static __init int find_near_online_node(int node)
-{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
-
- for_each_online_node(n) {
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
-}
-
/*
* Setup early cpu_to_node.
*
@@ -746,8 +728,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
- if (!node_online(node))
- node = find_near_online_node(node);
numa_set_node(cpu, node);
}
}
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 58f335c..83bc464 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -285,7 +285,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
* less than the max # of CPUs. They should be ignored _iff
* they are physically not present.
*/
- if (pr->id == -1) {
+ if (pr->id == -1 || !cpu_present(pr->id)) {
int ret = acpi_processor_hotadd_init(pr);
if (ret)
return ret;
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index c412fdb..98cdce9 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -674,6 +674,9 @@ static int __init acpi_init(void)
acpi_debugfs_init();
acpi_sleep_proc_init();
acpi_wakeup_device_init();
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+ acpi_set_processor_mapping();
+#endif
return 0;
}
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index b1ec78b..74798fe 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void)
}
static int map_lapic_id(struct acpi_subtable_header *entry,
- u32 acpi_id, phys_cpuid_t *apic_id)
+ u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled)
{
struct acpi_madt_local_apic *lapic =
container_of(entry, struct acpi_madt_local_apic, header);
- if (!(lapic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;
if (lapic->processor_id != acpi_id)
@@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry,
}
static int map_x2apic_id(struct acpi_subtable_header *entry,
- int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
+ int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
+ bool ignore_disabled)
{
struct acpi_madt_local_x2apic *apic =
container_of(entry, struct acpi_madt_local_x2apic, header);
- if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;
if (device_declaration && (apic->uid == acpi_id)) {
@@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry,
}
static int map_lsapic_id(struct acpi_subtable_header *entry,
- int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
+ int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
+ bool ignore_disabled)
{
struct acpi_madt_local_sapic *lsapic =
container_of(entry, struct acpi_madt_local_sapic, header);
- if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED))
return -ENODEV;
if (device_declaration) {
@@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry,
* Retrieve the ARM CPU physical identifier (MPIDR)
*/
static int map_gicc_mpidr(struct acpi_subtable_header *entry,
- int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr)
+ int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr,
+ bool ignore_disabled)
{
struct acpi_madt_generic_interrupt *gicc =
container_of(entry, struct acpi_madt_generic_interrupt, header);
- if (!(gicc->flags & ACPI_MADT_ENABLED))
+ if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED))
return -ENODEV;
/* device_declaration means Device object in DSDT, in the
@@ -108,7 +111,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
return -EINVAL;
}
-static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
+static phys_cpuid_t map_madt_entry(int type, u32 acpi_id, bool ignore_disabled)
{
unsigned long madt_end, entry;
phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */
@@ -128,16 +131,20 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
struct acpi_subtable_header *header =
(struct acpi_subtable_header *)entry;
if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
- if (!map_lapic_id(header, acpi_id, &phys_id))
+ if (!map_lapic_id(header, acpi_id,
+ &phys_id, ignore_disabled))
break;
} else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
- if (!map_x2apic_id(header, type, acpi_id, &phys_id))
+ if (!map_x2apic_id(header, type, acpi_id,
+ &phys_id, ignore_disabled))
break;
} else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
- if (!map_lsapic_id(header, type, acpi_id, &phys_id))
+ if (!map_lsapic_id(header, type, acpi_id,
+ &phys_id, ignore_disabled))
break;
} else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) {
- if (!map_gicc_mpidr(header, type, acpi_id, &phys_id))
+ if (!map_gicc_mpidr(header, type, acpi_id,
+ &phys_id, ignore_disabled))
break;
}
entry += header->length;
@@ -145,7 +152,8 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
return phys_id;
}
-static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
+static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id,
+ bool ignore_disabled)
{
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *obj;
@@ -166,30 +174,39 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
header = (struct acpi_subtable_header *)obj->buffer.pointer;
if (header->type == ACPI_MADT_TYPE_LOCAL_APIC)
- map_lapic_id(header, acpi_id, &phys_id);
+ map_lapic_id(header, acpi_id, &phys_id, ignore_disabled);
else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC)
- map_lsapic_id(header, type, acpi_id, &phys_id);
+ map_lsapic_id(header, type, acpi_id,
+ &phys_id, ignore_disabled);
else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC)
- map_x2apic_id(header, type, acpi_id, &phys_id);
+ map_x2apic_id(header, type, acpi_id,
+ &phys_id, ignore_disabled);
else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT)
- map_gicc_mpidr(header, type, acpi_id, &phys_id);
+ map_gicc_mpidr(header, type, acpi_id,
+ &phys_id, ignore_disabled);
exit:
kfree(buffer.pointer);
return phys_id;
}
-phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
+static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type,
+ u32 acpi_id, bool ignore_disabled)
{
phys_cpuid_t phys_id;
- phys_id = map_mat_entry(handle, type, acpi_id);
+ phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled);
if (phys_id == PHYS_CPUID_INVALID)
- phys_id = map_madt_entry(type, acpi_id);
+ phys_id = map_madt_entry(type, acpi_id, ignore_disabled);
return phys_id;
}
+phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
+{
+ return __acpi_get_phys_id(handle, type, acpi_id, true);
+}
+
int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id)
{
#ifdef CONFIG_SMP
@@ -246,6 +263,68 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
}
EXPORT_SYMBOL_GPL(acpi_get_cpuid);
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+static bool map_processor(acpi_handle handle, int *phys_id, int *cpuid)
+{
+ int type;
+ u32 acpi_id;
+ acpi_status status;
+ acpi_object_type acpi_type;
+ unsigned long long tmp;
+ union acpi_object object = { 0 };
+ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
+
+ status = acpi_get_type(handle, &acpi_type);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ switch (acpi_type) {
+ case ACPI_TYPE_PROCESSOR:
+ status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = object.processor.proc_id;
+ break;
+ case ACPI_TYPE_DEVICE:
+ status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return false;
+ acpi_id = tmp;
+ break;
+ default:
+ return false;
+ }
+
+ type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
+
+ *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
+ *cpuid = acpi_map_cpuid(*phys_id, acpi_id);
+ if (*cpuid == -1)
+ return false;
+ return true;
+}
+
+static acpi_status __init
+set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
+ void **rv)
+{
+ u32 apic_id;
+ int cpu_id;
+
+ if (!map_processor(handle, &apic_id, &cpu_id))
+ return AE_ERROR;
+ acpi_map_cpu2node(handle, cpu_id, apic_id);
+ return AE_OK;
+}
+
+void __init acpi_set_processor_mapping(void)
+{
+ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
+ ACPI_UINT32_MAX,
+ set_processor_node_mapping, NULL, NULL, NULL);
+}
+#endif
+
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
u64 *phys_addr, int *ioapic_id)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e3..70166df 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -162,6 +162,8 @@ typedef u32 phys_cpuid_t;
/* Arch dependent functions for cpu hotplug support */
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
int acpi_unmap_cpu(int cpu);
+void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
+void __init acpi_set_processor_mapping(void);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
--
1.8.3.1
Since the change to the cpu <--> node mapping (mapping the cpu to the physical
node for all possible cpus at boot), the node of a cpu may not be present,
so we use the nearest online node if the node is not present in the low
level allocation APIs.
---
V2: Maintain a per-cpu cache of the alternative node,
only for the x86 arch, to avoid additional overhead.
---
Signed-off-by: Gu Zheng <[email protected]>
---
arch/x86/include/asm/topology.h | 2 ++
arch/x86/mm/numa.c | 33 +++++++++++++++++++++++++++++++++
include/linux/gfp.h | 12 +++++++++++-
3 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f..37bb6b6 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -82,6 +82,8 @@ static inline const struct cpumask *cpumask_of_node(int node)
}
#endif
+extern int get_near_online_node(int node);
+
extern void setup_node_to_cpumask_map(void);
/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a733cf9..4126464 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -72,12 +72,34 @@ int numa_cpu_node(int cpu)
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
+cpumask_t node_to_cpuid_mask_map[MAX_NUMNODES];
/*
* Map cpu index to node index
*/
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+DEFINE_PER_CPU(int, x86_cpu_to_near_online_node);
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_near_online_node);
+
+static int find_near_online_node(int node)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for_each_online_node(n) {
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ return best_node;
+}
+
void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -95,7 +117,11 @@ void numa_set_node(int cpu, int node)
return;
}
#endif
+
+ per_cpu(x86_cpu_to_near_online_node, cpu) =
+ find_near_online_node(numa_cpu_node(cpu));
per_cpu(x86_cpu_to_node_map, cpu) = node;
+ cpumask_set_cpu(cpu, &node_to_cpuid_mask_map[numa_cpu_node(cpu)]);
set_cpu_numa_node(cpu, node);
}
@@ -105,6 +131,13 @@ void numa_clear_node(int cpu)
numa_set_node(cpu, NUMA_NO_NODE);
}
+int get_near_online_node(int node)
+{
+ return per_cpu(x86_cpu_to_near_online_node,
+ cpumask_first(&node_to_cpuid_mask_map[node]));
+}
+EXPORT_SYMBOL(get_near_online_node);
+
/*
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a9373..b233ea4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -305,13 +305,23 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
if (nid < 0)
nid = numa_node_id();
+#if IS_ENABLED(CONFIG_X86) && IS_ENABLED(CONFIG_NUMA)
+ if (!node_online(nid))
+ nid = get_near_online_node(nid);
+#endif
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
- VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+ VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+
+#if IS_ENABLED(CONFIG_X86) && IS_ENABLED(CONFIG_NUMA)
+ if (!node_online(nid))
+ nid = get_near_online_node(nid);
+#endif
+ VM_BUG_ON(!node_online(nid));
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
--
1.8.3.1
Hi Gu,
Eight months ago, I posted the following patch to relate
cpuid to apicid.
https://lkml.org/lkml/2014/9/3/1120
Could you try this patch?
Thanks,
Yasuaki Ishimatsu
On Thu, 14 May 2015 19:33:33 +0800
Gu Zheng <[email protected]> wrote:
> Yasuaki Ishimatsu found that with node online/offline, cpu<->node
> relationship is established. Because workqueue uses a info which
> was established at boot time, but it may be changed by node hotpluging.
>
> Once pool->node points to a stale node, following allocation failure
> happens.
> ==
> SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
> cache: kmalloc-192, object size: 192, buffer size: 192, default
> order:
> 1, min order: 0
> node 0: slabs: 6172, objs: 259224, free: 245741
> node 1: slabs: 3261, objs: 136962, free: 127656
> ==
>
> As the apicid <---> pxm and pxm <--> node relationship are persistent, then
> the apicid <--> node mapping is persistent, so the root cause is the
> cpu-id <-> lapicid mapping is not persistent (because the currently
> implementation always choose the first free cpu id for the new added cpu).
> If we can build persistent cpu-id <-> lapicid relationship, this problem
> will be fixed.
>
> This patch tries to build the whole world mapping cpuid <-> apicid <-> pxm <-> node
> for all possible processor at the boot, the detail implementation are 2 steps:
>
> Step1: generate a logic cpu id for all the local apic (both enabled and dsiabled)
> when register local apic
> Step2: map the cpu to the phyical node via an additional acpi ns walk for processor.
>
> Please refer to:
> https://lkml.org/lkml/2015/2/27/145
> https://lkml.org/lkml/2015/3/25/989
> for the previous discussion.
> ---
> V2: rebase on latest upstream.
> ---
>
> Signed-off-by: Gu Zheng <[email protected]>
> ---
> arch/ia64/kernel/acpi.c | 2 +-
> arch/x86/include/asm/mpspec.h | 1 +
> arch/x86/kernel/acpi/boot.c | 8 ++-
> arch/x86/kernel/apic/apic.c | 73 ++++++++++++++++++++-----
> arch/x86/mm/numa.c | 20 -------
> drivers/acpi/acpi_processor.c | 2 +-
> drivers/acpi/bus.c | 3 ++
> drivers/acpi/processor_core.c | 121 ++++++++++++++++++++++++++++++++++--------
> include/linux/acpi.h | 2 +
> 9 files changed, 172 insertions(+), 60 deletions(-)
>
> diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
> index b1698bc..7db5563 100644
> --- a/arch/ia64/kernel/acpi.c
> +++ b/arch/ia64/kernel/acpi.c
> @@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
> * ACPI based hotplug CPU support
> */
> #ifdef CONFIG_ACPI_HOTPLUG_CPU
> -static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> {
> #ifdef CONFIG_ACPI_NUMA
> /*
> diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
> index b07233b..db902d8 100644
> --- a/arch/x86/include/asm/mpspec.h
> +++ b/arch/x86/include/asm/mpspec.h
> @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
> #endif
>
> int generic_processor_info(int apicid, int version);
> +int __generic_processor_info(int apicid, int version, bool enabled);
>
> #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
>
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index dbe76a1..c79115b 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -174,15 +174,13 @@ static int acpi_register_lapic(int id, u8 enabled)
> return -EINVAL;
> }
>
> - if (!enabled) {
> + if (!enabled)
> ++disabled_cpus;
> - return -EINVAL;
> - }
>
> if (boot_cpu_physical_apicid != -1U)
> ver = apic_version[boot_cpu_physical_apicid];
>
> - return generic_processor_info(id, ver);
> + return __generic_processor_info(id, ver, enabled);
> }
>
> static int __init
> @@ -726,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void)
> #ifdef CONFIG_ACPI_HOTPLUG_CPU
> #include <acpi/processor.h>
>
> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> {
> #ifdef CONFIG_ACPI_NUMA
> int nid;
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index dcb5285..7fbf2cb 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -1977,7 +1977,38 @@ void disconnect_bsp_APIC(int virt_wire_setup)
> apic_write(APIC_LVT1, value);
> }
>
> -int generic_processor_info(int apicid, int version)
> +/*
> + * Logic cpu number(cpuid) to local APIC id persistent mappings.
> + * Do not clear the mapping even if cpu hot removed.
> + * */
> +static int apicid_to_cpuid[] = {
> + [0 ... NR_CPUS - 1] = -1,
> +};
> +
> +/*
> + * Internal cpu id bits, set the bit once cpu present, and never clear it.
> + * */
> +static cpumask_t cpuid_mask = CPU_MASK_NONE;
> +
> +static int get_cpuid(int apicid)
> +{
> + int free_id, i;
> +
> + free_id = cpumask_next_zero(-1, &cpuid_mask);
> + if (free_id >= nr_cpu_ids)
> + return -1;
> +
> + for (i = 0; i < free_id; i++)
> + if (apicid_to_cpuid[i] == apicid)
> + return i;
> +
> + apicid_to_cpuid[free_id] = apicid;
> + cpumask_set_cpu(free_id, &cpuid_mask);
> +
> + return free_id;
> +}
> +
> +int __generic_processor_info(int apicid, int version, bool enabled)
> {
> int cpu, max = nr_cpu_ids;
> bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
> @@ -2010,8 +2041,8 @@ int generic_processor_info(int apicid, int version)
> pr_warning("APIC: Disabling requested cpu."
> " Processor %d/0x%x ignored.\n",
> thiscpu, apicid);
> -
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -ENODEV;
> }
>
> @@ -2027,8 +2058,8 @@ int generic_processor_info(int apicid, int version)
> "ACPI: NR_CPUS/possible_cpus limit of %i almost"
> " reached. Keeping one slot for boot cpu."
> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
> -
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -ENODEV;
> }
>
> @@ -2039,11 +2070,11 @@ int generic_processor_info(int apicid, int version)
> "ACPI: NR_CPUS/possible_cpus limit of %i reached."
> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
>
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -EINVAL;
> }
>
> - num_processors++;
> if (apicid == boot_cpu_physical_apicid) {
> /*
> * x86_bios_cpu_apicid is required to have processors listed
> @@ -2053,9 +2084,20 @@ int generic_processor_info(int apicid, int version)
> * for BSP.
> */
> cpu = 0;
> - } else
> - cpu = cpumask_next_zero(-1, cpu_present_mask);
> -
> + } else {
> + cpu = get_cpuid(apicid);
> + if (cpu < 0) {
> + int thiscpu = max + disabled_cpus;
> +
> + pr_warning(" Processor %d/0x%x ignored.\n",
> + thiscpu, apicid);
> + if (enabled)
> + disabled_cpus++;
> + return -EINVAL;
> + }
> + }
> + if (enabled)
> + num_processors++;
> /*
> * Validate version
> */
> @@ -2071,7 +2113,8 @@ int generic_processor_info(int apicid, int version)
> apic_version[boot_cpu_physical_apicid], cpu, version);
> }
>
> - physid_set(apicid, phys_cpu_present_map);
> + if (enabled)
> + physid_set(apicid, phys_cpu_present_map);
> if (apicid > max_physical_apicid)
> max_physical_apicid = apicid;
>
> @@ -2084,11 +2127,17 @@ int generic_processor_info(int apicid, int version)
> apic->x86_32_early_logical_apicid(cpu);
> #endif
> set_cpu_possible(cpu, true);
> - set_cpu_present(cpu, true);
> + if (enabled)
> + set_cpu_present(cpu, true);
>
> return cpu;
> }
>
> +int generic_processor_info(int apicid, int version)
> +{
> + return __generic_processor_info(apicid, version, true);
> +}
> +
> int hard_smp_processor_id(void)
> {
> return read_apic_id();
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 4053bb5..a733cf9 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -702,24 +702,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static __init int find_near_online_node(int node)
> -{
> - int n, val;
> - int min_val = INT_MAX;
> - int best_node = -1;
> -
> - for_each_online_node(n) {
> - val = node_distance(node, n);
> -
> - if (val < min_val) {
> - min_val = val;
> - best_node = n;
> - }
> - }
> -
> - return best_node;
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -746,8 +728,6 @@ void __init init_cpu_to_node(void)
>
> if (node == NUMA_NO_NODE)
> continue;
> - if (!node_online(node))
> - node = find_near_online_node(node);
> numa_set_node(cpu, node);
> }
> }
> diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> index 58f335c..83bc464 100644
> --- a/drivers/acpi/acpi_processor.c
> +++ b/drivers/acpi/acpi_processor.c
> @@ -285,7 +285,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
> * less than the max # of CPUs. They should be ignored _iff
> * they are physically not present.
> */
> - if (pr->id == -1) {
> + if (pr->id == -1 || !cpu_present(pr->id)) {
> int ret = acpi_processor_hotadd_init(pr);
> if (ret)
> return ret;
> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> index c412fdb..98cdce9 100644
> --- a/drivers/acpi/bus.c
> +++ b/drivers/acpi/bus.c
> @@ -674,6 +674,9 @@ static int __init acpi_init(void)
> acpi_debugfs_init();
> acpi_sleep_proc_init();
> acpi_wakeup_device_init();
> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
> + acpi_set_processor_mapping();
> +#endif
> return 0;
> }
>
> diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
> index b1ec78b..74798fe 100644
> --- a/drivers/acpi/processor_core.c
> +++ b/drivers/acpi/processor_core.c
> @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void)
> }
>
> static int map_lapic_id(struct acpi_subtable_header *entry,
> - u32 acpi_id, phys_cpuid_t *apic_id)
> + u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled)
> {
> struct acpi_madt_local_apic *lapic =
> container_of(entry, struct acpi_madt_local_apic, header);
>
> - if (!(lapic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (lapic->processor_id != acpi_id)
> @@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry,
> }
>
> static int map_x2apic_id(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
> + bool ignore_disabled)
> {
> struct acpi_madt_local_x2apic *apic =
> container_of(entry, struct acpi_madt_local_x2apic, header);
>
> - if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (device_declaration && (apic->uid == acpi_id)) {
> @@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry,
> }
>
> static int map_lsapic_id(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
> + bool ignore_disabled)
> {
> struct acpi_madt_local_sapic *lsapic =
> container_of(entry, struct acpi_madt_local_sapic, header);
>
> - if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (device_declaration) {
> @@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry,
> * Retrieve the ARM CPU physical identifier (MPIDR)
> */
> static int map_gicc_mpidr(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr,
> + bool ignore_disabled)
> {
> struct acpi_madt_generic_interrupt *gicc =
> container_of(entry, struct acpi_madt_generic_interrupt, header);
>
> - if (!(gicc->flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> /* device_declaration means Device object in DSDT, in the
> @@ -108,7 +111,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
> return -EINVAL;
> }
>
> -static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> +static phys_cpuid_t map_madt_entry(int type, u32 acpi_id, bool ignore_disabled)
> {
> unsigned long madt_end, entry;
> phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */
> @@ -128,16 +131,20 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> struct acpi_subtable_header *header =
> (struct acpi_subtable_header *)entry;
> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
> - if (!map_lapic_id(header, acpi_id, &phys_id))
> + if (!map_lapic_id(header, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
> - if (!map_x2apic_id(header, type, acpi_id, &phys_id))
> + if (!map_x2apic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
> - if (!map_lsapic_id(header, type, acpi_id, &phys_id))
> + if (!map_lsapic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) {
> - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id))
> + if (!map_gicc_mpidr(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> }
> entry += header->length;
> @@ -145,7 +152,8 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> return phys_id;
> }
>
> -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
> +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id,
> + bool ignore_disabled)
> {
> struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
> union acpi_object *obj;
> @@ -166,30 +174,39 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
>
> header = (struct acpi_subtable_header *)obj->buffer.pointer;
> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC)
> - map_lapic_id(header, acpi_id, &phys_id);
> + map_lapic_id(header, acpi_id, &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC)
> - map_lsapic_id(header, type, acpi_id, &phys_id);
> + map_lsapic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC)
> - map_x2apic_id(header, type, acpi_id, &phys_id);
> + map_x2apic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT)
> - map_gicc_mpidr(header, type, acpi_id, &phys_id);
> + map_gicc_mpidr(header, type, acpi_id,
> + &phys_id, ignore_disabled);
>
> exit:
> kfree(buffer.pointer);
> return phys_id;
> }
>
> -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
> +static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type,
> + u32 acpi_id, bool ignore_disabled)
> {
> phys_cpuid_t phys_id;
>
> - phys_id = map_mat_entry(handle, type, acpi_id);
> + phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled);
> if (phys_id == PHYS_CPUID_INVALID)
> - phys_id = map_madt_entry(type, acpi_id);
> + phys_id = map_madt_entry(type, acpi_id, ignore_disabled);
>
> return phys_id;
> }
>
> +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
> +{
> + return __acpi_get_phys_id(handle, type, acpi_id, true);
> +}
> +
> int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id)
> {
> #ifdef CONFIG_SMP
> @@ -246,6 +263,68 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
> }
> EXPORT_SYMBOL_GPL(acpi_get_cpuid);
>
> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
> +static bool map_processor(acpi_handle handle, int *phys_id, int *cpuid)
> +{
> + int type;
> + u32 acpi_id;
> + acpi_status status;
> + acpi_object_type acpi_type;
> + unsigned long long tmp;
> + union acpi_object object = { 0 };
> + struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
> +
> + status = acpi_get_type(handle, &acpi_type);
> + if (ACPI_FAILURE(status))
> + return false;
> +
> + switch (acpi_type) {
> + case ACPI_TYPE_PROCESSOR:
> + status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
> + if (ACPI_FAILURE(status))
> + return false;
> + acpi_id = object.processor.proc_id;
> + break;
> + case ACPI_TYPE_DEVICE:
> + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
> + if (ACPI_FAILURE(status))
> + return false;
> + acpi_id = tmp;
> + break;
> + default:
> + return false;
> + }
> +
> + type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
> +
> + *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
> + *cpuid = acpi_map_cpuid(*phys_id, acpi_id);
> + if (*cpuid == -1)
> + return false;
> + return true;
> +}
> +
> +static acpi_status __init
> +set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
> + void **rv)
> +{
> + u32 apic_id;
> + int cpu_id;
> +
> + if (!map_processor(handle, &apic_id, &cpu_id))
> + return AE_ERROR;
> + acpi_map_cpu2node(handle, cpu_id, apic_id);
> + return AE_OK;
> +}
> +
> +void __init acpi_set_processor_mapping(void)
> +{
> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
> + ACPI_UINT32_MAX,
> + set_processor_node_mapping, NULL, NULL, NULL);
> +}
> +#endif
> +
> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
> static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
> u64 *phys_addr, int *ioapic_id)
> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> index e4da5e3..70166df 100644
> --- a/include/linux/acpi.h
> +++ b/include/linux/acpi.h
> @@ -162,6 +162,8 @@ typedef u32 phys_cpuid_t;
> /* Arch dependent functions for cpu hotplug support */
> int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
> int acpi_unmap_cpu(int cpu);
> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
> +void __init acpi_set_processor_mapping(void);
> #endif /* CONFIG_ACPI_HOTPLUG_CPU */
>
> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
Hi Ishimatsu,
On 05/15/2015 12:44 AM, Yasuaki Ishimatsu wrote:
> Hi Gu,
>
> Eight months ago, I posted the following patch to relate
> cpuid to apicid.
>
> https://lkml.org/lkml/2014/9/3/1120
>
> Could you try this patch?
Thanks for the reminder.
It seems similar to https://lkml.org/lkml/2015/3/25/989
"[PATCH 0/2] workqueue: fix a bug when numa mapping is changed".
Although it can also fix the issue, it does not seem to be the ideal
solution, because self-maintained cpumask mappings (or something
like them) are very common in the kernel.
As TJ and Kame suggested, it is feasible to build the mapping
for all the possible cpus at boot, so that we can ignore the
effect of cpu/node hotplug, especially for per-cpu cases.
Regards,
Gu
>
> Thanks,
> Yasuaki Ishimatsu
>
> On Thu, 14 May 2015 19:33:33 +0800
> Gu Zheng <[email protected]> wrote:
>
>> Yasuaki Ishimatsu found that with node online/offline, the cpu<->node
>> relationship is established. Workqueue uses info which was established
>> at boot time, but it may be changed by node hotplugging.
>>
>> Once pool->node points to a stale node, following allocation failure
>> happens.
>> ==
>> SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
>> cache: kmalloc-192, object size: 192, buffer size: 192, default
>> order:
>> 1, min order: 0
>> node 0: slabs: 6172, objs: 259224, free: 245741
>> node 1: slabs: 3261, objs: 136962, free: 127656
>> ==
>>
>> As the apicid <---> pxm and pxm <--> node relationships are persistent,
>> the apicid <--> node mapping is persistent, so the root cause is that the
>> cpu-id <-> lapicid mapping is not persistent (because the current
>> implementation always chooses the first free cpu id for a newly added cpu).
>> If we can build a persistent cpu-id <-> lapicid relationship, this problem
>> will be fixed.
>>
>> This patch tries to build the complete mapping cpuid <-> apicid <-> pxm <-> node
>> for all possible processors at boot. The implementation consists of 2 steps:
>>
>> Step1: generate a logical cpu id for every local apic (both enabled and disabled)
>> when registering the local apic
>> Step2: map the cpu to the physical node via an additional acpi ns walk for processors.
>>
>> Please refer to:
>> https://lkml.org/lkml/2015/2/27/145
>> https://lkml.org/lkml/2015/3/25/989
>> for the previous discussion.
>> ---
>> V2: rebase on latest upstream.
>> ---
>>
>> Signed-off-by: Gu Zheng <[email protected]>
>> ---
>> arch/ia64/kernel/acpi.c | 2 +-
>> arch/x86/include/asm/mpspec.h | 1 +
>> arch/x86/kernel/acpi/boot.c | 8 ++-
>> arch/x86/kernel/apic/apic.c | 73 ++++++++++++++++++++-----
>> arch/x86/mm/numa.c | 20 -------
>> drivers/acpi/acpi_processor.c | 2 +-
>> drivers/acpi/bus.c | 3 ++
>> drivers/acpi/processor_core.c | 121 ++++++++++++++++++++++++++++++++++--------
>> include/linux/acpi.h | 2 +
>> 9 files changed, 172 insertions(+), 60 deletions(-)
>>
>> diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
>> index b1698bc..7db5563 100644
>> --- a/arch/ia64/kernel/acpi.c
>> +++ b/arch/ia64/kernel/acpi.c
>> @@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
>> * ACPI based hotplug CPU support
>> */
>> #ifdef CONFIG_ACPI_HOTPLUG_CPU
>> -static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
>> +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
>> {
>> #ifdef CONFIG_ACPI_NUMA
>> /*
>> diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
>> index b07233b..db902d8 100644
>> --- a/arch/x86/include/asm/mpspec.h
>> +++ b/arch/x86/include/asm/mpspec.h
>> @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
>> #endif
>>
>> int generic_processor_info(int apicid, int version);
>> +int __generic_processor_info(int apicid, int version, bool enabled);
>>
>> #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
>>
>> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
>> index dbe76a1..c79115b 100644
>> --- a/arch/x86/kernel/acpi/boot.c
>> +++ b/arch/x86/kernel/acpi/boot.c
>> @@ -174,15 +174,13 @@ static int acpi_register_lapic(int id, u8 enabled)
>> return -EINVAL;
>> }
>>
>> - if (!enabled) {
>> + if (!enabled)
>> ++disabled_cpus;
>> - return -EINVAL;
>> - }
>>
>> if (boot_cpu_physical_apicid != -1U)
>> ver = apic_version[boot_cpu_physical_apicid];
>>
>> - return generic_processor_info(id, ver);
>> + return __generic_processor_info(id, ver, enabled);
>> }
>>
>> static int __init
>> @@ -726,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void)
>> #ifdef CONFIG_ACPI_HOTPLUG_CPU
>> #include <acpi/processor.h>
>>
>> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
>> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
>> {
>> #ifdef CONFIG_ACPI_NUMA
>> int nid;
>> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
>> index dcb5285..7fbf2cb 100644
>> --- a/arch/x86/kernel/apic/apic.c
>> +++ b/arch/x86/kernel/apic/apic.c
>> @@ -1977,7 +1977,38 @@ void disconnect_bsp_APIC(int virt_wire_setup)
>> apic_write(APIC_LVT1, value);
>> }
>>
>> -int generic_processor_info(int apicid, int version)
>> +/*
>> + * Logic cpu number(cpuid) to local APIC id persistent mappings.
>> + * Do not clear the mapping even if cpu hot removed.
>> + * */
>> +static int apicid_to_cpuid[] = {
>> + [0 ... NR_CPUS - 1] = -1,
>> +};
>> +
>> +/*
>> + * Internal cpu id bits, set the bit once cpu present, and never clear it.
>> + * */
>> +static cpumask_t cpuid_mask = CPU_MASK_NONE;
>> +
>> +static int get_cpuid(int apicid)
>> +{
>> + int free_id, i;
>> +
>> + free_id = cpumask_next_zero(-1, &cpuid_mask);
>> + if (free_id >= nr_cpu_ids)
>> + return -1;
>> +
>> + for (i = 0; i < free_id; i++)
>> + if (apicid_to_cpuid[i] == apicid)
>> + return i;
>> +
>> + apicid_to_cpuid[free_id] = apicid;
>> + cpumask_set_cpu(free_id, &cpuid_mask);
>> +
>> + return free_id;
>> +}
>> +
>> +int __generic_processor_info(int apicid, int version, bool enabled)
>> {
>> int cpu, max = nr_cpu_ids;
>> bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
>> @@ -2010,8 +2041,8 @@ int generic_processor_info(int apicid, int version)
>> pr_warning("APIC: Disabling requested cpu."
>> " Processor %d/0x%x ignored.\n",
>> thiscpu, apicid);
>> -
>> - disabled_cpus++;
>> + if (enabled)
>> + disabled_cpus++;
>> return -ENODEV;
>> }
>>
>> @@ -2027,8 +2058,8 @@ int generic_processor_info(int apicid, int version)
>> "ACPI: NR_CPUS/possible_cpus limit of %i almost"
>> " reached. Keeping one slot for boot cpu."
>> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
>> -
>> - disabled_cpus++;
>> + if (enabled)
>> + disabled_cpus++;
>> return -ENODEV;
>> }
>>
>> @@ -2039,11 +2070,11 @@ int generic_processor_info(int apicid, int version)
>> "ACPI: NR_CPUS/possible_cpus limit of %i reached."
>> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
>>
>> - disabled_cpus++;
>> + if (enabled)
>> + disabled_cpus++;
>> return -EINVAL;
>> }
>>
>> - num_processors++;
>> if (apicid == boot_cpu_physical_apicid) {
>> /*
>> * x86_bios_cpu_apicid is required to have processors listed
>> @@ -2053,9 +2084,20 @@ int generic_processor_info(int apicid, int version)
>> * for BSP.
>> */
>> cpu = 0;
>> - } else
>> - cpu = cpumask_next_zero(-1, cpu_present_mask);
>> -
>> + } else {
>> + cpu = get_cpuid(apicid);
>> + if (cpu < 0) {
>> + int thiscpu = max + disabled_cpus;
>> +
>> + pr_warning(" Processor %d/0x%x ignored.\n",
>> + thiscpu, apicid);
>> + if (enabled)
>> + disabled_cpus++;
>> + return -EINVAL;
>> + }
>> + }
>> + if (enabled)
>> + num_processors++;
>> /*
>> * Validate version
>> */
>> @@ -2071,7 +2113,8 @@ int generic_processor_info(int apicid, int version)
>> apic_version[boot_cpu_physical_apicid], cpu, version);
>> }
>>
>> - physid_set(apicid, phys_cpu_present_map);
>> + if (enabled)
>> + physid_set(apicid, phys_cpu_present_map);
>> if (apicid > max_physical_apicid)
>> max_physical_apicid = apicid;
>>
>> @@ -2084,11 +2127,17 @@ int generic_processor_info(int apicid, int version)
>> apic->x86_32_early_logical_apicid(cpu);
>> #endif
>> set_cpu_possible(cpu, true);
>> - set_cpu_present(cpu, true);
>> + if (enabled)
>> + set_cpu_present(cpu, true);
>>
>> return cpu;
>> }
>>
>> +int generic_processor_info(int apicid, int version)
>> +{
>> + return __generic_processor_info(apicid, version, true);
>> +}
>> +
>> int hard_smp_processor_id(void)
>> {
>> return read_apic_id();
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 4053bb5..a733cf9 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -702,24 +702,6 @@ void __init x86_numa_init(void)
>> numa_init(dummy_numa_init);
>> }
>>
>> -static __init int find_near_online_node(int node)
>> -{
>> - int n, val;
>> - int min_val = INT_MAX;
>> - int best_node = -1;
>> -
>> - for_each_online_node(n) {
>> - val = node_distance(node, n);
>> -
>> - if (val < min_val) {
>> - min_val = val;
>> - best_node = n;
>> - }
>> - }
>> -
>> - return best_node;
>> -}
>> -
>> /*
>> * Setup early cpu_to_node.
>> *
>> @@ -746,8 +728,6 @@ void __init init_cpu_to_node(void)
>>
>> if (node == NUMA_NO_NODE)
>> continue;
>> - if (!node_online(node))
>> - node = find_near_online_node(node);
>> numa_set_node(cpu, node);
>> }
>> }
>> diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
>> index 58f335c..83bc464 100644
>> --- a/drivers/acpi/acpi_processor.c
>> +++ b/drivers/acpi/acpi_processor.c
>> @@ -285,7 +285,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
>> * less than the max # of CPUs. They should be ignored _iff
>> * they are physically not present.
>> */
>> - if (pr->id == -1) {
>> + if (pr->id == -1 || !cpu_present(pr->id)) {
>> int ret = acpi_processor_hotadd_init(pr);
>> if (ret)
>> return ret;
>> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
>> index c412fdb..98cdce9 100644
>> --- a/drivers/acpi/bus.c
>> +++ b/drivers/acpi/bus.c
>> @@ -674,6 +674,9 @@ static int __init acpi_init(void)
>> acpi_debugfs_init();
>> acpi_sleep_proc_init();
>> acpi_wakeup_device_init();
>> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
>> + acpi_set_processor_mapping();
>> +#endif
>> return 0;
>> }
>>
>> diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
>> index b1ec78b..74798fe 100644
>> --- a/drivers/acpi/processor_core.c
>> +++ b/drivers/acpi/processor_core.c
>> @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void)
>> }
>>
>> static int map_lapic_id(struct acpi_subtable_header *entry,
>> - u32 acpi_id, phys_cpuid_t *apic_id)
>> + u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled)
>> {
>> struct acpi_madt_local_apic *lapic =
>> container_of(entry, struct acpi_madt_local_apic, header);
>>
>> - if (!(lapic->lapic_flags & ACPI_MADT_ENABLED))
>> + if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED))
>> return -ENODEV;
>>
>> if (lapic->processor_id != acpi_id)
>> @@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry,
>> }
>>
>> static int map_x2apic_id(struct acpi_subtable_header *entry,
>> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
>> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
>> + bool ignore_disabled)
>> {
>> struct acpi_madt_local_x2apic *apic =
>> container_of(entry, struct acpi_madt_local_x2apic, header);
>>
>> - if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
>> + if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED))
>> return -ENODEV;
>>
>> if (device_declaration && (apic->uid == acpi_id)) {
>> @@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry,
>> }
>>
>> static int map_lsapic_id(struct acpi_subtable_header *entry,
>> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
>> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
>> + bool ignore_disabled)
>> {
>> struct acpi_madt_local_sapic *lsapic =
>> container_of(entry, struct acpi_madt_local_sapic, header);
>>
>> - if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
>> + if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED))
>> return -ENODEV;
>>
>> if (device_declaration) {
>> @@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry,
>> * Retrieve the ARM CPU physical identifier (MPIDR)
>> */
>> static int map_gicc_mpidr(struct acpi_subtable_header *entry,
>> - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr)
>> + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr,
>> + bool ignore_disabled)
>> {
>> struct acpi_madt_generic_interrupt *gicc =
>> container_of(entry, struct acpi_madt_generic_interrupt, header);
>>
>> - if (!(gicc->flags & ACPI_MADT_ENABLED))
>> + if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED))
>> return -ENODEV;
>>
>> /* device_declaration means Device object in DSDT, in the
>> @@ -108,7 +111,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
>> return -EINVAL;
>> }
>>
>> -static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
>> +static phys_cpuid_t map_madt_entry(int type, u32 acpi_id, bool ignore_disabled)
>> {
>> unsigned long madt_end, entry;
>> phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */
>> @@ -128,16 +131,20 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
>> struct acpi_subtable_header *header =
>> (struct acpi_subtable_header *)entry;
>> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
>> - if (!map_lapic_id(header, acpi_id, &phys_id))
>> + if (!map_lapic_id(header, acpi_id,
>> + &phys_id, ignore_disabled))
>> break;
>> } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
>> - if (!map_x2apic_id(header, type, acpi_id, &phys_id))
>> + if (!map_x2apic_id(header, type, acpi_id,
>> + &phys_id, ignore_disabled))
>> break;
>> } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
>> - if (!map_lsapic_id(header, type, acpi_id, &phys_id))
>> + if (!map_lsapic_id(header, type, acpi_id,
>> + &phys_id, ignore_disabled))
>> break;
>> } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) {
>> - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id))
>> + if (!map_gicc_mpidr(header, type, acpi_id,
>> + &phys_id, ignore_disabled))
>> break;
>> }
>> entry += header->length;
>> @@ -145,7 +152,8 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
>> return phys_id;
>> }
>>
>> -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
>> +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id,
>> + bool ignore_disabled)
>> {
>> struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
>> union acpi_object *obj;
>> @@ -166,30 +174,39 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
>>
>> header = (struct acpi_subtable_header *)obj->buffer.pointer;
>> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC)
>> - map_lapic_id(header, acpi_id, &phys_id);
>> + map_lapic_id(header, acpi_id, &phys_id, ignore_disabled);
>> else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC)
>> - map_lsapic_id(header, type, acpi_id, &phys_id);
>> + map_lsapic_id(header, type, acpi_id,
>> + &phys_id, ignore_disabled);
>> else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC)
>> - map_x2apic_id(header, type, acpi_id, &phys_id);
>> + map_x2apic_id(header, type, acpi_id,
>> + &phys_id, ignore_disabled);
>> else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT)
>> - map_gicc_mpidr(header, type, acpi_id, &phys_id);
>> + map_gicc_mpidr(header, type, acpi_id,
>> + &phys_id, ignore_disabled);
>>
>> exit:
>> kfree(buffer.pointer);
>> return phys_id;
>> }
>>
>> -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
>> +static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type,
>> + u32 acpi_id, bool ignore_disabled)
>> {
>> phys_cpuid_t phys_id;
>>
>> - phys_id = map_mat_entry(handle, type, acpi_id);
>> + phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled);
>> if (phys_id == PHYS_CPUID_INVALID)
>> - phys_id = map_madt_entry(type, acpi_id);
>> + phys_id = map_madt_entry(type, acpi_id, ignore_disabled);
>>
>> return phys_id;
>> }
>>
>> +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
>> +{
>> + return __acpi_get_phys_id(handle, type, acpi_id, true);
>> +}
>> +
>> int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id)
>> {
>> #ifdef CONFIG_SMP
>> @@ -246,6 +263,68 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
>> }
>> EXPORT_SYMBOL_GPL(acpi_get_cpuid);
>>
>> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
>> +static bool map_processor(acpi_handle handle, int *phys_id, int *cpuid)
>> +{
>> + int type;
>> + u32 acpi_id;
>> + acpi_status status;
>> + acpi_object_type acpi_type;
>> + unsigned long long tmp;
>> + union acpi_object object = { 0 };
>> + struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
>> +
>> + status = acpi_get_type(handle, &acpi_type);
>> + if (ACPI_FAILURE(status))
>> + return false;
>> +
>> + switch (acpi_type) {
>> + case ACPI_TYPE_PROCESSOR:
>> + status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
>> + if (ACPI_FAILURE(status))
>> + return false;
>> + acpi_id = object.processor.proc_id;
>> + break;
>> + case ACPI_TYPE_DEVICE:
>> + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
>> + if (ACPI_FAILURE(status))
>> + return false;
>> + acpi_id = tmp;
>> + break;
>> + default:
>> + return false;
>> + }
>> +
>> + type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
>> +
>> + *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
>> + *cpuid = acpi_map_cpuid(*phys_id, acpi_id);
>> + if (*cpuid == -1)
>> + return false;
>> + return true;
>> +}
>> +
>> +static acpi_status __init
>> +set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
>> + void **rv)
>> +{
>> + u32 apic_id;
>> + int cpu_id;
>> +
>> + if (!map_processor(handle, &apic_id, &cpu_id))
>> + return AE_ERROR;
>> + acpi_map_cpu2node(handle, cpu_id, apic_id);
>> + return AE_OK;
>> +}
>> +
>> +void __init acpi_set_processor_mapping(void)
>> +{
>> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
>> + ACPI_UINT32_MAX,
>> + set_processor_node_mapping, NULL, NULL, NULL);
>> +}
>> +#endif
>> +
>> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
>> static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
>> u64 *phys_addr, int *ioapic_id)
>> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
>> index e4da5e3..70166df 100644
>> --- a/include/linux/acpi.h
>> +++ b/include/linux/acpi.h
>> @@ -162,6 +162,8 @@ typedef u32 phys_cpuid_t;
>> /* Arch dependent functions for cpu hotplug support */
>> int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
>> int acpi_unmap_cpu(int cpu);
>> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
>> +void __init acpi_set_processor_mapping(void);
>> #endif /* CONFIG_ACPI_HOTPLUG_CPU */
>>
>> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
>> --
>> 1.8.3.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
> .
>
ping...
Any comments or suggestions are welcome.
Regards,
Gu
On 05/14/2015 07:33 PM, Gu Zheng wrote:
> Yasuaki Ishimatsu found that with node online/offline, the cpu<->node
> relationship is established. Workqueue uses info which was established
> at boot time, but it may be changed by node hotplugging.
>
> Once pool->node points to a stale node, following allocation failure
> happens.
> ==
> SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
> cache: kmalloc-192, object size: 192, buffer size: 192, default
> order:
> 1, min order: 0
> node 0: slabs: 6172, objs: 259224, free: 245741
> node 1: slabs: 3261, objs: 136962, free: 127656
> ==
>
> As the apicid <---> pxm and pxm <--> node relationships are persistent,
> the apicid <--> node mapping is persistent, so the root cause is that the
> cpu-id <-> lapicid mapping is not persistent (because the current
> implementation always chooses the first free cpu id for a newly added cpu).
> If we can build a persistent cpu-id <-> lapicid relationship, this problem
> will be fixed.
>
> This patch tries to build the complete mapping cpuid <-> apicid <-> pxm <-> node
> for all possible processors at boot. The implementation consists of 2 steps:
>
> Step1: generate a logical cpu id for every local apic (both enabled and disabled)
> when registering the local apic
> Step2: map the cpu to the physical node via an additional acpi ns walk for processors.
>
> Please refer to:
> https://lkml.org/lkml/2015/2/27/145
> https://lkml.org/lkml/2015/3/25/989
> for the previous discussion.
> ---
> V2: rebase on latest upstream.
> ---
>
> Signed-off-by: Gu Zheng <[email protected]>
> ---
> arch/ia64/kernel/acpi.c | 2 +-
> arch/x86/include/asm/mpspec.h | 1 +
> arch/x86/kernel/acpi/boot.c | 8 ++-
> arch/x86/kernel/apic/apic.c | 73 ++++++++++++++++++++-----
> arch/x86/mm/numa.c | 20 -------
> drivers/acpi/acpi_processor.c | 2 +-
> drivers/acpi/bus.c | 3 ++
> drivers/acpi/processor_core.c | 121 ++++++++++++++++++++++++++++++++++--------
> include/linux/acpi.h | 2 +
> 9 files changed, 172 insertions(+), 60 deletions(-)
>
> diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
> index b1698bc..7db5563 100644
> --- a/arch/ia64/kernel/acpi.c
> +++ b/arch/ia64/kernel/acpi.c
> @@ -796,7 +796,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
> * ACPI based hotplug CPU support
> */
> #ifdef CONFIG_ACPI_HOTPLUG_CPU
> -static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> {
> #ifdef CONFIG_ACPI_NUMA
> /*
> diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
> index b07233b..db902d8 100644
> --- a/arch/x86/include/asm/mpspec.h
> +++ b/arch/x86/include/asm/mpspec.h
> @@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
> #endif
>
> int generic_processor_info(int apicid, int version);
> +int __generic_processor_info(int apicid, int version, bool enabled);
>
> #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
>
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index dbe76a1..c79115b 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -174,15 +174,13 @@ static int acpi_register_lapic(int id, u8 enabled)
> return -EINVAL;
> }
>
> - if (!enabled) {
> + if (!enabled)
> ++disabled_cpus;
> - return -EINVAL;
> - }
>
> if (boot_cpu_physical_apicid != -1U)
> ver = apic_version[boot_cpu_physical_apicid];
>
> - return generic_processor_info(id, ver);
> + return __generic_processor_info(id, ver, enabled);
> }
>
> static int __init
> @@ -726,7 +724,7 @@ static void __init acpi_set_irq_model_ioapic(void)
> #ifdef CONFIG_ACPI_HOTPLUG_CPU
> #include <acpi/processor.h>
>
> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> {
> #ifdef CONFIG_ACPI_NUMA
> int nid;
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index dcb5285..7fbf2cb 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -1977,7 +1977,38 @@ void disconnect_bsp_APIC(int virt_wire_setup)
> apic_write(APIC_LVT1, value);
> }
>
> -int generic_processor_info(int apicid, int version)
> +/*
> + * Logic cpu number(cpuid) to local APIC id persistent mappings.
> + * Do not clear the mapping even if cpu hot removed.
> + * */
> +static int apicid_to_cpuid[] = {
> + [0 ... NR_CPUS - 1] = -1,
> +};
> +
> +/*
> + * Internal cpu id bits, set the bit once cpu present, and never clear it.
> + * */
> +static cpumask_t cpuid_mask = CPU_MASK_NONE;
> +
> +static int get_cpuid(int apicid)
> +{
> + int free_id, i;
> +
> + free_id = cpumask_next_zero(-1, &cpuid_mask);
> + if (free_id >= nr_cpu_ids)
> + return -1;
> +
> + for (i = 0; i < free_id; i++)
> + if (apicid_to_cpuid[i] == apicid)
> + return i;
> +
> + apicid_to_cpuid[free_id] = apicid;
> + cpumask_set_cpu(free_id, &cpuid_mask);
> +
> + return free_id;
> +}
> +
> +int __generic_processor_info(int apicid, int version, bool enabled)
> {
> int cpu, max = nr_cpu_ids;
> bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
> @@ -2010,8 +2041,8 @@ int generic_processor_info(int apicid, int version)
> pr_warning("APIC: Disabling requested cpu."
> " Processor %d/0x%x ignored.\n",
> thiscpu, apicid);
> -
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -ENODEV;
> }
>
> @@ -2027,8 +2058,8 @@ int generic_processor_info(int apicid, int version)
> "ACPI: NR_CPUS/possible_cpus limit of %i almost"
> " reached. Keeping one slot for boot cpu."
> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
> -
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -ENODEV;
> }
>
> @@ -2039,11 +2070,11 @@ int generic_processor_info(int apicid, int version)
> "ACPI: NR_CPUS/possible_cpus limit of %i reached."
> " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
>
> - disabled_cpus++;
> + if (enabled)
> + disabled_cpus++;
> return -EINVAL;
> }
>
> - num_processors++;
> if (apicid == boot_cpu_physical_apicid) {
> /*
> * x86_bios_cpu_apicid is required to have processors listed
> @@ -2053,9 +2084,20 @@ int generic_processor_info(int apicid, int version)
> * for BSP.
> */
> cpu = 0;
> - } else
> - cpu = cpumask_next_zero(-1, cpu_present_mask);
> -
> + } else {
> + cpu = get_cpuid(apicid);
> + if (cpu < 0) {
> + int thiscpu = max + disabled_cpus;
> +
> + pr_warning(" Processor %d/0x%x ignored.\n",
> + thiscpu, apicid);
> + if (enabled)
> + disabled_cpus++;
> + return -EINVAL;
> + }
> + }
> + if (enabled)
> + num_processors++;
> /*
> * Validate version
> */
> @@ -2071,7 +2113,8 @@ int generic_processor_info(int apicid, int version)
> apic_version[boot_cpu_physical_apicid], cpu, version);
> }
>
> - physid_set(apicid, phys_cpu_present_map);
> + if (enabled)
> + physid_set(apicid, phys_cpu_present_map);
> if (apicid > max_physical_apicid)
> max_physical_apicid = apicid;
>
> @@ -2084,11 +2127,17 @@ int generic_processor_info(int apicid, int version)
> apic->x86_32_early_logical_apicid(cpu);
> #endif
> set_cpu_possible(cpu, true);
> - set_cpu_present(cpu, true);
> + if (enabled)
> + set_cpu_present(cpu, true);
>
> return cpu;
> }
>
> +int generic_processor_info(int apicid, int version)
> +{
> + return __generic_processor_info(apicid, version, true);
> +}
> +
> int hard_smp_processor_id(void)
> {
> return read_apic_id();
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 4053bb5..a733cf9 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -702,24 +702,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static __init int find_near_online_node(int node)
> -{
> - int n, val;
> - int min_val = INT_MAX;
> - int best_node = -1;
> -
> - for_each_online_node(n) {
> - val = node_distance(node, n);
> -
> - if (val < min_val) {
> - min_val = val;
> - best_node = n;
> - }
> - }
> -
> - return best_node;
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -746,8 +728,6 @@ void __init init_cpu_to_node(void)
>
> if (node == NUMA_NO_NODE)
> continue;
> - if (!node_online(node))
> - node = find_near_online_node(node);
> numa_set_node(cpu, node);
> }
> }
> diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> index 58f335c..83bc464 100644
> --- a/drivers/acpi/acpi_processor.c
> +++ b/drivers/acpi/acpi_processor.c
> @@ -285,7 +285,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
> * less than the max # of CPUs. They should be ignored _iff
> * they are physically not present.
> */
> - if (pr->id == -1) {
> + if (pr->id == -1 || !cpu_present(pr->id)) {
> int ret = acpi_processor_hotadd_init(pr);
> if (ret)
> return ret;
> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> index c412fdb..98cdce9 100644
> --- a/drivers/acpi/bus.c
> +++ b/drivers/acpi/bus.c
> @@ -674,6 +674,9 @@ static int __init acpi_init(void)
> acpi_debugfs_init();
> acpi_sleep_proc_init();
> acpi_wakeup_device_init();
> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
> + acpi_set_processor_mapping();
> +#endif
> return 0;
> }
>
> diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
> index b1ec78b..74798fe 100644
> --- a/drivers/acpi/processor_core.c
> +++ b/drivers/acpi/processor_core.c
> @@ -32,12 +32,12 @@ static struct acpi_table_madt *get_madt_table(void)
> }
>
> static int map_lapic_id(struct acpi_subtable_header *entry,
> - u32 acpi_id, phys_cpuid_t *apic_id)
> + u32 acpi_id, phys_cpuid_t *apic_id, bool ignore_disabled)
> {
> struct acpi_madt_local_apic *lapic =
> container_of(entry, struct acpi_madt_local_apic, header);
>
> - if (!(lapic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(lapic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (lapic->processor_id != acpi_id)
> @@ -48,12 +48,13 @@ static int map_lapic_id(struct acpi_subtable_header *entry,
> }
>
> static int map_x2apic_id(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
> + bool ignore_disabled)
> {
> struct acpi_madt_local_x2apic *apic =
> container_of(entry, struct acpi_madt_local_x2apic, header);
>
> - if (!(apic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(apic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (device_declaration && (apic->uid == acpi_id)) {
> @@ -65,12 +66,13 @@ static int map_x2apic_id(struct acpi_subtable_header *entry,
> }
>
> static int map_lsapic_id(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id,
> + bool ignore_disabled)
> {
> struct acpi_madt_local_sapic *lsapic =
> container_of(entry, struct acpi_madt_local_sapic, header);
>
> - if (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(lsapic->lapic_flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> if (device_declaration) {
> @@ -87,12 +89,13 @@ static int map_lsapic_id(struct acpi_subtable_header *entry,
> * Retrieve the ARM CPU physical identifier (MPIDR)
> */
> static int map_gicc_mpidr(struct acpi_subtable_header *entry,
> - int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr)
> + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr,
> + bool ignore_disabled)
> {
> struct acpi_madt_generic_interrupt *gicc =
> container_of(entry, struct acpi_madt_generic_interrupt, header);
>
> - if (!(gicc->flags & ACPI_MADT_ENABLED))
> + if (ignore_disabled && !(gicc->flags & ACPI_MADT_ENABLED))
> return -ENODEV;
>
> /* device_declaration means Device object in DSDT, in the
> @@ -108,7 +111,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
> return -EINVAL;
> }
>
> -static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> +static phys_cpuid_t map_madt_entry(int type, u32 acpi_id, bool ignore_disabled)
> {
> unsigned long madt_end, entry;
> phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */
> @@ -128,16 +131,20 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> struct acpi_subtable_header *header =
> (struct acpi_subtable_header *)entry;
> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC) {
> - if (!map_lapic_id(header, acpi_id, &phys_id))
> + if (!map_lapic_id(header, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) {
> - if (!map_x2apic_id(header, type, acpi_id, &phys_id))
> + if (!map_x2apic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) {
> - if (!map_lsapic_id(header, type, acpi_id, &phys_id))
> + if (!map_lsapic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) {
> - if (!map_gicc_mpidr(header, type, acpi_id, &phys_id))
> + if (!map_gicc_mpidr(header, type, acpi_id,
> + &phys_id, ignore_disabled))
> break;
> }
> entry += header->length;
> @@ -145,7 +152,8 @@ static phys_cpuid_t map_madt_entry(int type, u32 acpi_id)
> return phys_id;
> }
>
> -static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
> +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id,
> + bool ignore_disabled)
> {
> struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
> union acpi_object *obj;
> @@ -166,30 +174,39 @@ static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id)
>
> header = (struct acpi_subtable_header *)obj->buffer.pointer;
> if (header->type == ACPI_MADT_TYPE_LOCAL_APIC)
> - map_lapic_id(header, acpi_id, &phys_id);
> + map_lapic_id(header, acpi_id, &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC)
> - map_lsapic_id(header, type, acpi_id, &phys_id);
> + map_lsapic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC)
> - map_x2apic_id(header, type, acpi_id, &phys_id);
> + map_x2apic_id(header, type, acpi_id,
> + &phys_id, ignore_disabled);
> else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT)
> - map_gicc_mpidr(header, type, acpi_id, &phys_id);
> + map_gicc_mpidr(header, type, acpi_id,
> + &phys_id, ignore_disabled);
>
> exit:
> kfree(buffer.pointer);
> return phys_id;
> }
>
> -phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
> +static phys_cpuid_t __acpi_get_phys_id(acpi_handle handle, int type,
> + u32 acpi_id, bool ignore_disabled)
> {
> phys_cpuid_t phys_id;
>
> - phys_id = map_mat_entry(handle, type, acpi_id);
> + phys_id = map_mat_entry(handle, type, acpi_id, ignore_disabled);
> if (phys_id == PHYS_CPUID_INVALID)
> - phys_id = map_madt_entry(type, acpi_id);
> + phys_id = map_madt_entry(type, acpi_id, ignore_disabled);
>
> return phys_id;
> }
>
> +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
> +{
> + return __acpi_get_phys_id(handle, type, acpi_id, true);
> +}
> +
> int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id)
> {
> #ifdef CONFIG_SMP
> @@ -246,6 +263,68 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
> }
> EXPORT_SYMBOL_GPL(acpi_get_cpuid);
>
> +#ifdef CONFIG_ACPI_HOTPLUG_CPU
> +static bool map_processor(acpi_handle handle, int *phys_id, int *cpuid)
> +{
> + int type;
> + u32 acpi_id;
> + acpi_status status;
> + acpi_object_type acpi_type;
> + unsigned long long tmp;
> + union acpi_object object = { 0 };
> + struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
> +
> + status = acpi_get_type(handle, &acpi_type);
> + if (ACPI_FAILURE(status))
> + return false;
> +
> + switch (acpi_type) {
> + case ACPI_TYPE_PROCESSOR:
> + status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
> + if (ACPI_FAILURE(status))
> + return false;
> + acpi_id = object.processor.proc_id;
> + break;
> + case ACPI_TYPE_DEVICE:
> + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp);
> + if (ACPI_FAILURE(status))
> + return false;
> + acpi_id = tmp;
> + break;
> + default:
> + return false;
> + }
> +
> + type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
> +
> + *phys_id = __acpi_get_phys_id(handle, type, acpi_id, false);
> + *cpuid = acpi_map_cpuid(*phys_id, acpi_id);
> + if (*cpuid == -1)
> + return false;
> + return true;
> +}
> +
> +static acpi_status __init
> +set_processor_node_mapping(acpi_handle handle, u32 lvl, void *context,
> + void **rv)
> +{
> + u32 apic_id;
> + int cpu_id;
> +
> + if (!map_processor(handle, &apic_id, &cpu_id))
> + return AE_ERROR;
> + acpi_map_cpu2node(handle, cpu_id, apic_id);
> + return AE_OK;
> +}
> +
> +void __init acpi_set_processor_mapping(void)
> +{
> + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
> + ACPI_UINT32_MAX,
> + set_processor_node_mapping, NULL, NULL, NULL);
> +}
> +#endif
> +
> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
> static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base,
> u64 *phys_addr, int *ioapic_id)
> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> index e4da5e3..70166df 100644
> --- a/include/linux/acpi.h
> +++ b/include/linux/acpi.h
> @@ -162,6 +162,8 @@ typedef u32 phys_cpuid_t;
> /* Arch dependent functions for cpu hotplug support */
> int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
> int acpi_unmap_cpu(int cpu);
> +void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid);
> +void __init acpi_set_processor_mapping(void);
> #endif /* CONFIG_ACPI_HOTPLUG_CPU */
>
> #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC