From: Gu Zheng
Subject: [PATCH] workqueue: update numa affinity when node hotplug
Date: Fri, 27 Feb 2015 18:04:52 +0800
Message-ID: <1425031492-32300-1-git-send-email-guz.fnst@cn.fujitsu.com>

Yasuaki Ishimatsu found that the cpu<->node relationship is re-established
on node online/offline.  Workqueue caches this relationship as it stood at
boot time, so node hotplug can leave the cached copy stale.  Once
pool->node points to a stale node, the following allocation failure
happens:

==
 SLUB: Unable to allocate memory on node 2 (gfp=0x80d0)
  cache: kmalloc-192, object size: 192, buffer size: 192, default order: 1, min order: 0
  node 0: slabs: 6172, objs: 259224, free: 245741
  node 1: slabs: 3261, objs: 136962, free: 127656
==

This patch uses the present cpumask directly rather than the
self-maintained wq_numa_possible_cpumask, so that the workqueue code
becomes a simple consumer of NUMA topology, and it updates each per-cpu
workqueue pool's node affinity when a NUMA node changes, via a notifier
callback registered for the node on/down events raised in
try_online_node()/try_offline_node().  Unbound workqueues' per-node pools
are already updated by the existing code, via wq_update_unbound_numa() at
CPU_DOWN_PREPARE of a node's last CPU.  A minimal sketch of a client of
the new notifier interface is appended after the patch.
Reported-by: Yasuaki Ishimatsu
Signed-off-by: Gu Zheng
---
 include/linux/memory_hotplug.h |  8 +++
 kernel/workqueue.c             | 93 +++++++++++++++++++++++++--------------
 mm/memory_hotplug.c            | 44 +++++++++++++++++++
 3 files changed, 111 insertions(+), 34 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8f1a419..07bb94e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,6 +98,14 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+/* event type of node on/down */
+#define NODE_ON		0x00001
+#define NODE_DOWN	0x00002
+
+extern int __ref register_node_notifier(struct notifier_block *nb);
+extern int __ref unregister_node_notifier(struct notifier_block *nb);
+extern void get_present_cpumask_of_node(cpumask_var_t cpumask, int node);
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
 extern int arch_remove_memory(u64 start, u64 size);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beeeac9..60c5d29 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,9 +265,6 @@ struct workqueue_struct {
 
 static struct kmem_cache *pwq_cache;
 
-static cpumask_var_t *wq_numa_possible_cpumask;
-					/* possible CPUs of each node */
-
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
@@ -3493,10 +3490,18 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 	pool->attrs->no_numa = false;
 
 	/* if cpumask is contained inside a NUMA node, we belong to that node */
-	if (wq_numa_enabled) {
+	if (wq_numa_enabled && !cpumask_empty(pool->attrs->cpumask)) {
 		for_each_node(node) {
-			if (cpumask_subset(pool->attrs->cpumask,
-					   wq_numa_possible_cpumask[node])) {
+			int cpu;
+			bool is_sub_set = true;
+
+			for_each_cpu(cpu, pool->attrs->cpumask)
+				if (cpu_to_node(cpu) != node) {
+					is_sub_set = false;
+					break;
+				}
+
+			if (is_sub_set) {
 				pool->node = node;
 				break;
 			}
@@ -3717,8 +3722,9 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
 	if (cpumask_empty(cpumask))
 		goto use_dfl;
 
-	/* yeap, return possible CPUs in @node that @attrs wants */
-	cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+	/* yeap, return present CPUs in @node that @attrs wants */
+	get_present_cpumask_of_node(cpumask, node);
+	cpumask_and(cpumask, attrs->cpumask, cpumask);
 	return !cpumask_equal(cpumask, attrs->cpumask);
 
 use_dfl:
@@ -4564,6 +4570,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
+	int node = cpu_to_node(cpu);
 	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	int pi;
@@ -4571,6 +4578,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		for_each_cpu_worker_pool(pool, cpu) {
+			pool->node = node;
 			if (pool->nr_workers)
 				continue;
 			if (!create_worker(pool))
@@ -4787,11 +4795,50 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
-static void __init wq_numa_init(void)
+static int wq_numa_callback(struct notifier_block *self,
+			    unsigned long action, void *arg)
 {
-	cpumask_var_t *tbl;
-	int node, cpu;
+	int node = (unsigned long)arg;
+	struct worker_pool *pool;
+	int pi;
+	int cpu;
 
+	switch (action) {
+	case NODE_DOWN:
+		mutex_lock(&wq_pool_mutex);
+		for_each_pool(pool, pi) {
+			if (pool->node == node) {
+				pool->node = NUMA_NO_NODE;
+				if (pool->cpu < 0)
+					hash_del(&pool->hash_node);
+			}
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	case NODE_ON:
+		mutex_lock(&wq_pool_mutex);
+		for_each_present_cpu(cpu) {
+			if (node != cpu_to_node(cpu))
+				continue;
+			for_each_cpu_worker_pool(pool, cpu)
+				pool->node = node;
+		}
+		mutex_unlock(&wq_pool_mutex);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block wq_numa_nb = {
+	.notifier_call = wq_numa_callback,
+	.priority = 0
+};
+
+static void __init wq_numa_init(void)
+{
 	if (num_possible_nodes() <= 1)
 		return;
 
@@ -4803,29 +4850,7 @@ static void __init wq_numa_init(void)
 	wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
 	BUG_ON(!wq_update_unbound_numa_attrs_buf);
 
-	/*
-	 * We want masks of possible CPUs of each node which isn't readily
-	 * available.  Build one from cpu_to_node() which should have been
-	 * fully initialized by now.
-	 */
-	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
-	BUG_ON(!tbl);
-
-	for_each_node(node)
-		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
-				node_online(node) ? node : NUMA_NO_NODE));
-
-	for_each_possible_cpu(cpu) {
-		node = cpu_to_node(cpu);
-		if (WARN_ON(node == NUMA_NO_NODE)) {
-			pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
-			/* happens iff arch is bonkers, let's just proceed */
-			return;
-		}
-		cpumask_set_cpu(cpu, tbl[node]);
-	}
-
-	wq_numa_possible_cpumask = tbl;
+	register_node_notifier(&wq_numa_nb);
 	wq_numa_enabled = true;
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab107..1778628 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1132,6 +1132,44 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 	return;
 }
 
+static RAW_NOTIFIER_HEAD(node_chain);
+
+int __ref register_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_register(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+int __ref unregister_node_notifier(struct notifier_block *nb)
+{
+	int ret;
+
+	ret = raw_notifier_chain_unregister(&node_chain, nb);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+static int call_node_notify(unsigned long val, void *v)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&node_chain, val, v, -1, NULL);
+
+	return notifier_to_errno(ret);
+}
+
+void get_present_cpumask_of_node(cpumask_var_t cpumask, int node)
+{
+	unsigned int cpu;
+
+	for_each_present_cpu(cpu)
+		if (node == cpu_to_node(cpu))
+			cpumask_set_cpu(cpu, cpumask);
+}
+EXPORT_SYMBOL(get_present_cpumask_of_node);
+
 /**
  * try_online_node - online a node if offlined
  *
@@ -1157,6 +1195,9 @@ int try_online_node(int nid)
 	ret = register_one_node(nid);
 	BUG_ON(ret);
 
+	/* notify that the node is on */
+	call_node_notify(NODE_ON, (void *)(long)nid);
+
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
 		build_all_zonelists(NULL, NULL);
@@ -1978,6 +2019,9 @@ void try_offline_node(int nid)
 		vfree(zone->wait_table);
 	}
 
+	/* notify that the node is down */
+	call_node_notify(NODE_DOWN, (void *)(long)nid);
+
 	/*
 	 * Since there is no way to guarentee the address of pgdat/zone is not
 	 * on stack of any kernel threads or used by other kernel objects
-- 
1.7.7
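
As mentioned above, here is a minimal sketch of how another subsystem
could consume the proposed interface.  Only NODE_ON, NODE_DOWN,
register_node_notifier() and unregister_node_notifier() come from this
patch; every "example_*" name is hypothetical and used purely for
illustration.

/*
 * Hypothetical client of the node on/down notifier proposed above.
 * The node id is passed as the notifier payload, as in
 * call_node_notify() in this patch.
 */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memory_hotplug.h>

static int example_node_callback(struct notifier_block *self,
				 unsigned long action, void *arg)
{
	int nid = (unsigned long)arg;	/* node id is the payload */

	switch (action) {
	case NODE_ON:
		/* re-derive any cached nid -> resource mapping here */
		pr_info("example: node %d is on\n", nid);
		break;
	case NODE_DOWN:
		/* drop per-node state that still refers to @nid here */
		pr_info("example: node %d is down\n", nid);
		break;
	default:
		break;
	}

	return 0;
}

static struct notifier_block example_node_nb = {
	.notifier_call	= example_node_callback,
};

static int __init example_init(void)
{
	return register_node_notifier(&example_node_nb);
}

static void __exit example_exit(void)
{
	unregister_node_notifier(&example_node_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Note that node_chain is a raw notifier chain, so it provides no locking
of its own; as wq_numa_callback() does with wq_pool_mutex, each callback
must serialize against its own subsystem's state itself.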