From: Gu Zheng
Subject: [PATCH] workqueue: update numa affinity when node hotplug
Date: Fri, 27 Feb 2015 18:04:52 +0800
Message-ID: <1425031492-32300-1-git-send-email-guz.fnst@cn.fujitsu.com>

Yasuaki Ishimatsu found that the cpu<->node relationship is re-established
on node online/offline.  Workqueue caches this relationship as it stood at
boot time, so node hotplug can leave the cached copy stale.  Once
pool->node points to a stale node, the following allocation failure
happens:

==
 SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
  cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
  node 0: slabs: 6172, objs: 259224, free: 245741
  node 1: slabs: 3261, objs: 136962, free: 127656
==

This patch uses the present cpumask directly rather than the
self-maintained wq_numa_possible_cpumask, so that the workqueue code
becomes a simple consumer of NUMA topology, and it updates each per-cpu
workqueue pool's node affinity when a NUMA node changes, via a notifier
callback registered for the node on/down events raised in
try_online_node()/try_offline_node().  Unbound workqueues' per-node pools
are already updated by the existing code, via wq_update_unbound_numa() at
CPU_DOWN_PREPARE of a node's last CPU.  A minimal sketch of a client of
the new notifier interface is appended after the patch.
Reported-by: Yasuaki Ishimatsu
Signed-off-by: Gu Zheng
---
 include/linux/memory_hotplug.h |  8 +++
 kernel/workqueue.c             | 93 +++++++++++++++++++++++++--------------
 mm/memory_hotplug.c            | 44 +++++++++++++++++++
 3 files changed, 111 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+/* event type of node on/down */
+#define NODE_ON		0x00001
+#define NODE_DOWN	0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {
 
 static struct kmem_cache *pwq_cache;
 
-static cpumask_var_t *wq_numa_possible_cpumask;
-					/* possible CPUs of each node */
-
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 	pool->attrs->no_numa = false;
 
 	/* if cpumask is contained inside a NUMA node, we belong to that node */
-	if (wq_numa_enabled) {
+	if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
 		for_each_node(node) {
-			if (cpumask_subset(pool->attrs->cpumask,
-					   wq_numa_possible_cpumask[node])) {
+			int cpu;
+			bool is_sub_set = true;
+
+			for_each_cpu(cpu, pool->attrs->cpumask)
+				if (cpu_to_node(cpu) != node) {
+					is_sub_set = false;
+					break;
+				}
+
+			if (is_sub_set) {
 				pool->node = node;
 				break;
 			}
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
 	if (cpumask_empty(cpumask))
 		goto use_dfl;
 
-	/* yeap, return possible CPUs in @node that @attrs wants */
-	cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+	/* yeap, return present CPUs in @node that @attrs wants */
+	get_present_cpumask_of_node(cpumask, node);
+	cpumask_and(cpumask, attrs->cpumask, cpumask);
 	return !cpumask_equal(cpumask, attrs->cpumask);
 
 use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
+	int node = cpu_to_node(cpu);
 	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		for_each_cpu_worker_pool(pool, cpu) {
+			pool->node = node;
 			if (pool->nr_workers)
 				continue;
 			if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+			    unsigned long action, void *arg)
 {
-	cpumask_var_t *tbl;
-	int node, cpu;
+	int node = (unsigned long)arg;
+	struct worker_pool *pool;
+	int pi;
+	int cpu;
 
+	switch (action) {
+	case NODE_DOWN:
+		mutex_lock(&wq_pool_mutex);
+		for_each_pool(pool, pi) {
+			if (pool->node == node) {
+				pool->node = NUMA_NO_NODE;
+				if (pool->cpu < 0)
+					hash_del(&pool->hash_node);
+			}
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	case NODE_ON:
+		mutex_lock(&wq_pool_mutex);
+		for_each_present_cpu(cpu) {
+			if (node != cpu_to_node(cpu))
+				continue;
+			for_each_cpu_worker_pool(pool, cpu)
+				pool->node = node;
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+	.notifier_call = wq_numa_callback,
+	.priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
 	if (num_possible_nodes() <= 1)
 		return;
 
@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
 	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
 	BUG_ON(!wq_update_unbound_numa_attrs_buf);
 
-	/*
-	 * We want masks of possible CPUs of each node which isn't readily
-	 * available.  Build one from cpu_to_node() which should have been
-	 * fully initialized by now.
-	 */
-	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
-	BUG_ON(!tbl);
-
-	for_each_node(node)
-		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
-				node_online(node) ? node : NUMA_NO_NODE));
-
-	for_each_possible_cpu(cpu) {
-		node = cpu_to_node(cpu);
-		if (WARN_ON(node == NUMA_NO_NODE)) {
-			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
-			/* happens iff arch is bonkers, let's just proceed */
-			return;
-		}
-		cpumask_set_cpu(cpu, tbl[node]);
-	}
-
-	wq_numa_possible_cpumask = tbl;
+	register_node_notifier(&wq_numa_nb);
 	wq_numa_enabled = true;
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,44 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 	return;
 }
 
+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_register(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_unregister(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+	return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+	unsigned int cpu;
+
+	for_each_present_cpu(cpu)
+		if (node == cpu_to_node(cpu))
+			cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);
+
 /**
  * try_online_node - online a node if offlined
  *
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	/* notify that the node is on */
+	call_node_notify(NODE_ON, (void *)(long)nid);
+
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
 		vfree(zone->wait_table);
 	}
 
+	/* notify that the node is down */
+	call_node_notify(NODE_DOWN, (void *)(long)nid);
+
 	/*
 	 * Since there is no way to guarentee the address of pgdat/zone is not
 	 * on stack of any kernel threads or used by other kernel objects
-- 
1.7.7
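
As mentioned above, here is a minimal sketch of how another subsystem
could consume the proposed interface.  Only NODE_ON, NODE_DOWN,
register_node_notifier() and unregister_node_notifier() come from this
patch; every "example_*" name is hypothetical and used purely for
illustration.

/*
 * Hypothetical client of the node on/down notifier proposed above.
 * The node id is passed as the notifier payload, as in
 * call_node_notify() in this patch.
 */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memory_hotplug.h>

static int example_node_callback(struct notifier_block *self,
				 unsigned long action, void *arg)
{
	int nid = (unsigned long)arg;	/* node id is the payload */

	switch (action) {
	case NODE_ON:
		/* re-derive any cached nid -> resource mapping here */
		pr_info("example: node %d is on\n", nid);
		break;
	case NODE_DOWN:
		/* drop per-node state that still refers to @nid here */
		pr_info("example: node %d is down\n", nid);
		break;
	default:
		break;
	}

	return 0;
}

static struct notifier_block example_node_nb = {
	.notifier_call	= example_node_callback,
};

static int __init example_init(void)
{
	return register_node_notifier(&example_node_nb);
}

static void __exit example_exit(void)
{
	unregister_node_notifier(&example_node_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Note that node_chain is a raw notifier chain, so it provides no locking
of its own; as wq_numa_callback() does with wq_pool_mutex, each callback
must serialize against its own subsystem's state itself.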