2015-02-27 10:22:25

by Gu Zheng

Subject: [PATCH] workqueue: update numa affinity when node hotplug

Yasuaki Ishimatsu found that the cpu<->node relationship is
re-established on node online/offline. Workqueue, however, keeps using
the mapping that was established at boot time, which node hotplug can
render stale.

Once pool->node points to a stale node, the following allocation
failure occurs:
==
SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
node 0: slabs: 6172, objs: 259224, free: 245741
node 1: slabs: 3261, objs: 136962, free: 127656
==

This patch makes workqueue use the present cpumask directly rather than
the self-maintained wq_numa_possible_cpumask, so that workqueue becomes
a simple consumer of NUMA information, and updates each per-cpu
workqueue pool's node affinity when the NUMA topology changes, via a
notifier callback registered for the node on/down events raised from
try_online_node()/try_offline_node().

Unbound workqueues' per-node pools are already updated by the existing
wq_update_unbound_numa() at CPU_DOWN_PREPARE of the node's last CPU.

Reported-by: Yasuaki Ishimatsu <[email protected]>
Signed-off-by: Gu Zheng <[email protected]>
---
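A minimal usage sketch for reviewers, showing how another subsystem
could consume the node notifier introduced above. Only
register_node_notifier(), NODE_ON and NODE_DOWN come from this patch;
the foo_* names are hypothetical placeholders:

#include <linux/init.h>
#include <linux/memory_hotplug.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* hypothetical per-node bookkeeping, stands in for real subsystem state */
static void foo_node_state_update(int node, bool online)
{
	pr_info("foo: node %d is now %s\n", node, online ? "on" : "down");
}

static int foo_node_callback(struct notifier_block *self,
			     unsigned long action, void *arg)
{
	int node = (unsigned long)arg;

	switch (action) {
	case NODE_ON:
		/* node came online: rebuild per-node state */
		foo_node_state_update(node, true);
		break;
	case NODE_DOWN:
		/* node going away: invalidate per-node state */
		foo_node_state_update(node, false);
		break;
	}

	return 0;
}

static struct notifier_block foo_node_nb = {
	.notifier_call	= foo_node_callback,
	.priority	= 0,
};

static int __init foo_node_init(void)
{
	return register_node_notifier(&foo_node_nb);
}
late_initcall(foo_node_init);

Since node_chain is a raw notifier chain, callbacks run in the hotplug
path without any locking provided by the chain itself, so whatever the
callback touches must supply its own synchronization (as
wq_numa_callback does with wq_pool_mutex).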
include/linux/memory_hotplug.h | 8 +++
kernel/workqueue.c | 93 +++++++++++++++++++++++++--------------
mm/memory_hotplug.c | 45 +++++++++++++++++++
3 files changed, 112 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);

extern int try_online_node(int nid);

+/* event types of node on/down */
+#define NODE_ON 0x00001
+#define NODE_DOWN 0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
#ifdef CONFIG_MEMORY_HOTREMOVE
extern bool is_pageblock_removable_nolock(struct page *page);
extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {

static struct kmem_cache *pwq_cache;

-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
-
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
pool->attrs->no_numa = false;

/* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
+ if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
for_each_node(node) {
- if (cpumask_subset(pool->attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
+ int cpu;
+ bool is_sub_set = true;
+
+ for_each_cpu(cpu, pool->attrs->cpumask)
+ if (cpu_to_node(cpu) != node) {
+ is_sub_set = false;
+ break;
+ }
+
+ if (is_sub_set) {
pool->node = node;
break;
}
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
if (cpumask_empty(cpumask))
goto use_dfl;

- /* yeap, return possible CPUs in @node that @attrs wants */
- cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ /* yeap, return present CPUs in @node that @attrs wants */
+ get_present_cpumask_of_node(cpumask, node);
+ cpumask_and(cpumask, attrs->cpumask, cpumask);
return !cpumask_equal(cpumask, attrs->cpumask);

use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
+ int node = cpu_to_node(cpu);
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
for_each_cpu_worker_pool(pool, cpu) {
+ pool->node = node;
if (pool->nr_workers)
continue;
if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */

-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
{
- cpumask_var_t *tbl;
- int node, cpu;
+ int node = (unsigned long)arg;
+ struct worker_pool *pool;
+ int pi;
+ int cpu;

+ switch (action) {
+ case NODE_DOWN:
+ mutex_lock(&wq_pool_mutex);
+ for_each_pool(pool, pi) {
+ if (pool->node == node) {
+ pool->node = NUMA_NO_NODE;
+ if (pool->cpu < 0)
+ hash_del(&pool->hash_node);
+ }
+ }
+ mutex_unlock(&wq_pool_mutex);
+ break;
+ case NODE_ON:
+ mutex_lock(&wq_pool_mutex);
+ for_each_present_cpu(cpu) {
+ if (node != cpu_to_node(cpu))
+ continue;
+ for_each_cpu_worker_pool(pool, cpu)
+ pool->node = node;
+ }
+ mutex_unlock(&wq_pool_mutex);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+ .notifier_call = wq_numa_callback,
+ .priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
if (num_possible_nodes() <= 1)
return;

@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
BUG_ON(!wq_update_unbound_numa_attrs_buf);

- /*
- * We want masks of possible CPUs of each node which isn't readily
- * available. Build one from cpu_to_node() which should have been
- * fully initialized by now.
- */
- tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
- BUG_ON(!tbl);
-
- for_each_node(node)
- BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE));
-
- for_each_possible_cpu(cpu) {
- node = cpu_to_node(cpu);
- if (WARN_ON(node == NUMA_NO_NODE)) {
- pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
- /* happens iff arch is bonkers, let's just proceed */
- return;
- }
- cpumask_set_cpu(cpu, tbl[node]);
- }
-
- wq_numa_possible_cpumask = tbl;
+ register_node_notifier(&wq_numa_nb);
wq_numa_enabled = true;
}

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,45 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
return;
}

+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ ret = raw_notifier_chain_register(&node_chain, nb);
+ return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ ret = raw_notifier_chain_unregister(&node_chain, nb);
+ return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+ return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+ unsigned int cpu;
+
+ cpumask_clear(cpumask); /* do not rely on the caller passing a zeroed mask */
+ for_each_present_cpu(cpu)
+ if (node == cpu_to_node(cpu))
+ cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);

/**
* try_online_node - online a node if offlined
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);

+ /* notify that the node is on */
+ call_node_notify(NODE_ON, (void *)(long)nid);
+
if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
vfree(zone->wait_table);
}

+ /* notify that the node is down */
+ call_node_notify(NODE_DOWN, (void *)(long)nid);
+
/*
* Since there is no way to guarentee the address of pgdat/zone is not
* on stack of any kernel threads or used by other kernel objects
--
1.7.7


2015-02-27 11:54:08

by Tejun Heo

Subject: Re: [PATCH] workqueue: update numa affinity when node hotplug

Hello,

On Fri, Feb 27, 2015 at 06:04:52PM +0800, Gu Zheng wrote:
> Yasuaki Ishimatsu found that the cpu<->node relationship is
> re-established on node online/offline. Workqueue, however, keeps using
> the mapping that was established at boot time, which node hotplug can
> render stale.

I've asked this a couple of times now, but can somebody please justify
why the cpu<->node relationship needs to change? If it has to change,
that's okay, but let's please first make sure that we understand why
such a change is necessary so that we can figure out what kind of
facilities are needed to support such dynamism.

Thanks.

--
tejun