From: Miao Xie
Reply-To: miaox@cn.fujitsu.com
To: Ingo Molnar, Andrew Morton, Paul Menage
CC: Linux-Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
Date: Wed, 21 Jan 2009 16:06:20 +0800
Message-ID: <4976D77C.3020107@cn.fujitsu.com>

When 'memory_spread_page' is set, a task still allocates its page cache
on the old nodes after its cpuset's mems is changed. This is caused by
the task's stale mems_allowed: the current kernel does not update it
unless some function invokes cpuset_update_task_memory_state(), which is
sometimes too late. We must update the tasks' mems_allowed in time. Slab
allocation has the same problem.

We fix the bug by updating a task's mems_allowed and spread flags as soon
as its cpuset's mems or spread flags are changed.

Signed-off-by: Miao Xie
---
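(Note, not part of the patch: a minimal userspace sketch of the sequence
that exposes the problem. It assumes the cpuset filesystem is mounted at
/dev/cpuset and that the machine has at least two memory nodes; the
cpuset name and node numbers are only illustrative.)

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Write a short string into a cpuset control file. */
static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char pid[16];

	/* Create a cpuset bound to node 0 with page spreading enabled. */
	mkdir("/dev/cpuset/test", 0755);
	write_file("/dev/cpuset/test/cpus", "0");
	write_file("/dev/cpuset/test/mems", "0");
	write_file("/dev/cpuset/test/memory_spread_page", "1");

	/* Move ourselves into the new cpuset. */
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_file("/dev/cpuset/test/tasks", pid);

	/*
	 * Switch the cpuset to node 1. Before this patch the task keeps
	 * its old mems_allowed until something calls
	 * cpuset_update_task_memory_state(), so page cache created by a
	 * following file read can still land on node 0.
	 */
	write_file("/dev/cpuset/test/mems", "1");
	return 0;
}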
 include/linux/cpuset.h |    4 -
 include/linux/sched.h  |    1 -
 init/main.c            |    3 +-
 kernel/cpuset.c        |  204 +++++++++++++++--------------------------------
 kernel/kthread.c       |    1 +
 mm/mempolicy.c         |   12 ---
 mm/page_alloc.c        |    5 +-
 7 files changed, 69 insertions(+), 161 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074..c8155a6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -17,7 +17,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
-extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -26,7 +25,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
@@ -83,7 +81,6 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 #else /* !CONFIG_CPUSETS */
 
-static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -105,7 +102,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
 
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b8..2c9a93c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1332,7 +1332,6 @@ struct task_struct {
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;
-	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
diff --git a/init/main.c b/init/main.c
index 8442094..3ce3b5d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -635,7 +635,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 	vmalloc_init();
 	vfs_caches_init_early();
-	cpuset_init_early();
 	page_cgroup_init();
 	mem_init();
 	enable_debug_pagealloc();
@@ -845,6 +844,8 @@ static int __init kernel_init(void * unused)
 	 */
 	init_pid_ns.child_reaper = current;
 
+	current->mems_allowed = node_possible_map;
+
 	cad_pid = task_pid(current);
 
 	smp_prepare_cpus(setup_max_cpus);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a856788..36436fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -89,12 +89,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -172,27 +166,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -224,8 +197,8 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * The task_struct fields mems_allowed may only be accessed in the context
+ * of that task, so require no locks.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -327,77 +300,6 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk); /* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		if (is_spread_page(cs))
-			tsk->flags |= PF_SPREAD_PAGE;
-		else
-			tsk->flags &= ~PF_SPREAD_PAGE;
-		if (is_spread_slab(cs))
-			tsk->flags |= PF_SPREAD_SLAB;
-		else
-			tsk->flags &= ~PF_SPREAD_SLAB;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -990,14 +892,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
  * migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1005,8 +899,6 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
 	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
 	mutex_unlock(&callback_mutex);
@@ -1076,6 +968,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 				"Cpuset mempolicy rebind incomplete.\n");
 			break;
 		}
+		mutex_lock(&callback_mutex);
+		guarantee_online_mems(cs, &p->mems_allowed);
+		mutex_unlock(&callback_mutex);
+		mpol_rebind_task(p, &p->mems_allowed);
 		mm = get_task_mm(p);
 		if (!mm)
 			continue;
@@ -1118,10 +1014,9 @@ done:
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * rebind any vma mempolicies and if the cpuset is marked
+ * 'memory_migrate', migrate the tasks pages to the new memory.
  *
  * Call with cgroup_mutex held. May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1169,7 +1064,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	retval = update_tasks_nodemask(cs, &oldmem);
@@ -1197,6 +1091,33 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 	return 0;
 }
 
+static void cpuset_change_flag(struct task_struct *tsk,
+				struct cgroup_scanner *scan)
+{
+	struct cpuset *cs;
+
+	cs = task_cs(tsk);
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
+}
+
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+	struct cgroup_scanner scan;
+
+	scan.cg = cs->css.cgroup;
+	scan.test_task = NULL;
+	scan.process_task = cpuset_change_flag;
+	scan.heap = heap;
+	cgroup_scan_tasks(&scan);
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit: the bit to update (see cpuset_flagbits_t)
@@ -1212,6 +1133,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	struct cpuset *trialcs;
 	int err;
 	int balance_flag_changed;
+	int spread_changed;
+	struct ptr_heap heap;
 
 	trialcs = alloc_trial_cpuset(cs);
 	if (!trialcs)
@@ -1226,9 +1149,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (err < 0)
 		goto out;
 
+	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (err)
+		goto out;
+
 	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));
 
+	spread_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+			|| (is_spread_page(cs) != is_spread_page(trialcs)));
+
 	mutex_lock(&callback_mutex);
 	cs->flags = trialcs->flags;
 	mutex_unlock(&callback_mutex);
@@ -1236,6 +1166,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
 
+	if (spread_changed)
+		update_tasks_flags(cs, &heap);
+
+	heap_free(&heap);
 out:
 	free_trial_cpuset(trialcs);
 	return err;
@@ -1374,15 +1308,29 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		tsk->mems_allowed = node_possible_map;
 	} else {
 		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
+		guarantee_online_mems(cs, &tsk->mems_allowed);
 		mutex_unlock(&callback_mutex);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	mutex_lock(&callback_mutex);
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
+	mutex_unlock(&callback_mutex);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
@@ -1444,11 +1392,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		break;
 	case FILE_SPREAD_PAGE:
 		retval = update_flag(CS_SPREAD_PAGE, cs, val);
-		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_SPREAD_SLAB:
 		retval = update_flag(CS_SPREAD_SLAB, cs, val);
-		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	default:
 		retval = -EINVAL;
@@ -1787,8 +1733,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1800,7 +1744,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1809,7 +1752,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1828,8 +1770,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1850,21 +1790,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1875,11 +1800,12 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456..90469e6 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -242,6 +242,7 @@ int kthreadd(void *unused)
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
 
+	current->mems_allowed = node_possible_map;
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
 	for (;;) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6f..5912b03 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -222,10 +222,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->flags = flags;
 
 	if (nodes) {
-		/*
-		 * cpuset related setup doesn't apply to local allocation
-		 */
-		cpuset_update_task_memory_state();
 		if (flags & MPOL_F_RELATIVE_NODES)
 			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 					       &cpuset_current_mems_allowed);
@@ -674,7 +670,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_task_memory_state();
 	if (flags &
 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 		return -EINVAL;
@@ -1545,8 +1540,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	cpuset_update_task_memory_state();
-
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
@@ -1585,16 +1578,11 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  *
- *	Don't call cpuset_update_task_memory_state() unless
- *	1) it's ok to take cpuset_sem (can WAIT), and
- *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

-	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b30..503219c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1573,10 +1573,7 @@ nofail_alloc:
 			/* We now go into synchronous reclaim */
 			cpuset_memory_pressure_bump();
-			/*
-			 * The task's cpuset might have expanded its set of allowable nodes
-			 */
-			cpuset_update_task_memory_state();
+
 			p->flags |= PF_MEMALLOC;
 			reclaim_state.reclaimed_slab = 0;
 			p->reclaim_state = &reclaim_state;
-- 
1.6.0.3
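
(Again not part of the patch: one illustrative way to see the effect of
the change is to read the task's node mask back from /proc right after
the cpuset's 'mems' file is rewritten. On kernels that expose the
Mems_allowed_list field in /proc/<pid>/status, the value should follow
the cpuset immediately, without the task having to allocate memory first.)

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	/* Print the task's current memory node mask. */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Mems_allowed_list:", 18))
			fputs(line, stdout);
	fclose(f);
	return 0;
}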