Date: Tue, 15 Dec 2009 18:16:23 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	cl@linux-foundation.org, minchan.kim@gmail.com,
	Lee.Schermerhorn@hp.com
Subject: [mmotm][PATCH 5/5] mm : count lowmem rss
Message-Id: <20091215181623.1321f391.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20091215180904.c307629f.kamezawa.hiroyu@jp.fujitsu.com>
References: <20091215180904.c307629f.kamezawa.hiroyu@jp.fujitsu.com>

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Some OOM kills are caused by a memory shortage in a lowmem area; for
example, ZONE_NORMAL can be exhausted on an x86-32/HIGHMEM kernel.  The
oom-killer currently has no lowmem usage information for processes and
selects its victims from global memory usage information alone.  In a bad
case, this causes a chain of kills of innocent processes that frees no
lowmem at all -- an oom serial killer.

To make the oom-killer lowmem aware, this patch adds counters for
accounting lowmem usage per process.  (Patches for the oom-killer itself
are not included here.)

Adding a counter is easy, but one concern is the cost of the new counter.
This patch adds no extra counting cost; it only adds an "if" to check
whether a page is lowmem.  A micro benchmark shows almost no regression.

Changelog: 2009/12/14
 - made get_xx_rss() uninlined functions.
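To illustrate the accounting scheme before the patch itself: the counters
come in file/anon pairs, and the lowmem variant of each sits
LOWMEM_COUNTER (= 2) slots after its highmem one, so selecting the right
slot is one branch and one add.  A minimal sketch (illustration only, not
part of the diff; account_page_rss() is a made-up name here, and
is_lowmem_page() is the zone test introduced earlier in this series):

	static void account_page_rss(struct mm_struct *mm,
				     struct page *page, int val)
	{
		/* base counter: anonymous or file-backed */
		int member = PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;

		/* page sits in a lower zone: shift to the MM_*_LOWPAGES slot */
		if (is_lowmem_page(page))
			member += LOWMEM_COUNTER;
		add_mm_counter(mm, member, val);
	}

The real code below folds this branch into add_mm_counter_fast() and the
new inc/dec_mm_counter_page() helpers, so callers just pass the page.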
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 fs/proc/task_mmu.c       |    4 +-
 include/linux/mm.h       |   27 +++++++++++++---
 include/linux/mm_types.h |    7 ++--
 mm/filemap_xip.c         |    2 -
 mm/fremap.c              |    2 -
 mm/memory.c              |   80 +++++++++++++++++++++++++++++++++----------
 mm/oom_kill.c            |    8 ++--
 mm/rmap.c                |   10 +++--
 mm/swapfile.c            |    2 -
 9 files changed, 105 insertions(+), 37 deletions(-)

Index: mmotm-2.6.32-Dec8-pth/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Dec8-pth/include/linux/mm_types.h
@@ -194,11 +194,14 @@ struct core_state {
 };

 enum {
-	MM_FILEPAGES,
-	MM_ANONPAGES,
+	MM_FILEPAGES,		/* file rss is MM_FILEPAGES + MM_FILE_LOWPAGES */
+	MM_ANONPAGES,		/* anon rss is MM_ANONPAGES + MM_ANON_LOWPAGES */
+	MM_FILE_LOWPAGES,	/* pages from lower zones in file rss */
+	MM_ANON_LOWPAGES,	/* pages from lower zones in anon rss */
 	MM_SWAPENTS,
 	NR_MM_COUNTERS
 };
+#define LOWMEM_COUNTER	2

 #if USE_SPLIT_PTLOCKS
 #define SPLIT_RSS_COUNTING
Index: mmotm-2.6.32-Dec8-pth/mm/memory.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/memory.c
+++ mmotm-2.6.32-Dec8-pth/mm/memory.c
@@ -137,7 +137,7 @@ void __sync_task_rss_stat(struct task_st
 	task->rss_stat.events = 0;
 }

-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
+static void __add_mm_counter_fast(struct mm_struct *mm, int member, int val)
 {
 	struct task_struct *task = current;

@@ -146,8 +146,17 @@ static void add_mm_counter_fast(struct m
 	else
 		add_mm_counter(mm, member, val);
 }
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
+static void add_mm_counter_fast(struct mm_struct *mm, int member,
+				int val, struct page *page)
+{
+	if (is_lowmem_page(page))
+		member += LOWMEM_COUNTER;
+	__add_mm_counter_fast(mm, member, val);
+}
+#define inc_mm_counter_fast(mm, member, page)	\
+	add_mm_counter_fast(mm, member, 1, page)
+#define dec_mm_counter_fast(mm, member, page)	\
+	add_mm_counter_fast(mm, member, -1, page)

 /* sync counter once per 64 page faults */
 #define TASK_RSS_EVENTS_THRESH	(64)
@@ -183,8 +192,9 @@ void sync_mm_rss(struct task_struct *tas
 }
 #else

-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+#define inc_mm_counter_fast(mm, member, page) inc_mm_counter_page(mm, member, page)
+#define dec_mm_counter_fast(mm, member, page) dec_mm_counter_page(mm, member, page)
+#define __add_mm_counter_fast(mm, member, val) add_mm_counter(mm, member, val)

 static void check_sync_rss_stat(struct task_struct *task)
 {
@@ -195,6 +205,30 @@ void sync_mm_rss(struct task_struct *tas
 }
 #endif

+unsigned long get_file_rss(struct mm_struct *mm)
+{
+	return get_mm_counter(mm, MM_FILEPAGES)
+		+ get_mm_counter(mm, MM_FILE_LOWPAGES);
+}
+
+unsigned long get_anon_rss(struct mm_struct *mm)
+{
+	return get_mm_counter(mm, MM_ANONPAGES)
+		+ get_mm_counter(mm, MM_ANON_LOWPAGES);
+}
+
+unsigned long get_low_rss(struct mm_struct *mm)
+{
+	return get_mm_counter(mm, MM_ANON_LOWPAGES)
+		+ get_mm_counter(mm, MM_FILE_LOWPAGES);
+}
+
+unsigned long get_mm_rss(struct mm_struct *mm)
+{
+	return get_file_rss(mm) + get_anon_rss(mm);
+}
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
@@ -714,12 +748,17 @@ copy_one_pte(struct mm_struct *dst_mm, s
 	page = vm_normal_page(vma, addr, pte);
 	if (page) {
+		int type;
+
 		get_page(page);
 		page_dup_rmap(page);
 		if (PageAnon(page))
-			rss[MM_ANONPAGES]++;
+			type = MM_ANONPAGES;
 		else
-			rss[MM_FILEPAGES]++;
+			type = MM_FILEPAGES;
+		if (is_lowmem_page(page))
+			type += LOWMEM_COUNTER;
+		rss[type]++;
 	}

 out_set_pte:
@@ -905,6 +944,7 @@ static unsigned long zap_pte_range(struc
 	pte_t *pte;
 	spinlock_t *ptl;
 	int rss[NR_MM_COUNTERS];
+	int type;

 	init_rss_vec(rss);

@@ -952,15 +992,18 @@ static unsigned long zap_pte_range(struc
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
-				rss[MM_ANONPAGES]--;
+				type = MM_ANONPAGES;
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
-				rss[MM_FILEPAGES]--;
+				type = MM_FILEPAGES;
 			}
+			if (is_lowmem_page(page))
+				type += LOWMEM_COUNTER;
+			rss[type]--;
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
 				print_bad_pte(vma, addr, ptent, page);
@@ -1621,7 +1664,7 @@ static int insert_page(struct vm_area_st
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter_fast(mm, MM_FILEPAGES);
+	inc_mm_counter_fast(mm, MM_FILEPAGES, page);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2257,11 +2300,12 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter_fast(mm, MM_FILEPAGES);
-				inc_mm_counter_fast(mm, MM_ANONPAGES);
+				dec_mm_counter_fast(mm, MM_FILEPAGES, old_page);
+				inc_mm_counter_fast(mm, MM_ANONPAGES, new_page);
 			}
 		} else
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES, new_page);
+
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2694,8 +2738,9 @@ static int do_swap_page(struct mm_struct
 	 * discarded at swap_free().
	 */
-	inc_mm_counter_fast(mm, MM_ANONPAGES);
-	dec_mm_counter_fast(mm, MM_SWAPENTS);
+	inc_mm_counter_fast(mm, MM_ANONPAGES, page);
+	/* the SWAPENTS counter is not page-related, so use the bare call */
+	__add_mm_counter_fast(mm, MM_SWAPENTS, -1);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2779,7 +2824,7 @@ static int do_anonymous_page(struct mm_s
 	if (!pte_none(*page_table))
 		goto release;
-	inc_mm_counter_fast(mm, MM_ANONPAGES);
+	inc_mm_counter_fast(mm, MM_ANONPAGES, page);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2933,10 +2978,10 @@ static int __do_fault(struct mm_struct *
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
+			inc_mm_counter_fast(mm, MM_ANONPAGES, page);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter_fast(mm, MM_FILEPAGES);
+			inc_mm_counter_fast(mm, MM_FILEPAGES, page);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
Index: mmotm-2.6.32-Dec8-pth/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/rmap.c
+++ mmotm-2.6.32-Dec8-pth/mm/rmap.c
@@ -815,9 +815,9 @@ int try_to_unmap_one(struct page *page,
 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, MM_ANONPAGES);
+			dec_mm_counter_page(mm, MM_ANONPAGES, page);
 		else
-			dec_mm_counter(mm, MM_FILEPAGES);
+			dec_mm_counter_page(mm, MM_FILEPAGES, page);
 		set_pte_at(mm, address, pte,
 			   swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -839,7 +839,7 @@ int try_to_unmap_one(struct page *page,
 			list_add(&mm->mmlist, &init_mm.mmlist);
 			spin_unlock(&mmlist_lock);
 		}
-		dec_mm_counter(mm, MM_ANONPAGES);
+		dec_mm_counter_page(mm, MM_ANONPAGES, page);
 		inc_mm_counter(mm, MM_SWAPENTS);
 	} else if (PAGE_MIGRATION) {
 		/*
@@ -858,7 +858,7 @@ int try_to_unmap_one(struct page *page,
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, MM_FILEPAGES);
+		dec_mm_counter_page(mm, MM_FILEPAGES, page);

 	page_remove_rmap(page);
 	page_cache_release(page);
@@ -998,6 +998,6 @@ static int try_to_unmap_cluster(unsigned
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, MM_FILEPAGES);
+		dec_mm_counter_page(mm, MM_FILEPAGES, page);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
Index: mmotm-2.6.32-Dec8-pth/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/swapfile.c
+++ mmotm-2.6.32-Dec8-pth/mm/swapfile.c
@@ -841,7 +841,7 @@ static int unuse_pte(struct vm_area_stru
 	}

 	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	inc_mm_counter_page(vma->vm_mm, MM_ANONPAGES, page);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Dec8-pth/mm/filemap_xip.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/filemap_xip.c
+++ mmotm-2.6.32-Dec8-pth/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry:
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page);
-			dec_mm_counter(mm, MM_FILEPAGES);
+			dec_mm_counter_page(mm, MM_FILEPAGES, page);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
Index: mmotm-2.6.32-Dec8-pth/mm/fremap.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/fremap.c
+++ mmotm-2.6.32-Dec8-pth/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, MM_FILEPAGES);
+			dec_mm_counter_page(mm, MM_FILEPAGES, page);
 		}
 	} else {
 		if (!pte_file(pte))
Index: mmotm-2.6.32-Dec8-pth/include/linux/mm.h
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/include/linux/mm.h
+++ mmotm-2.6.32-Dec8-pth/include/linux/mm.h
@@ -939,11 +939,10 @@ static inline void dec_mm_counter(struct

 #endif /* !USE_SPLIT_PTLOCKS */

-static inline unsigned long get_mm_rss(struct mm_struct *mm)
-{
-	return get_mm_counter(mm, MM_FILEPAGES) +
-		get_mm_counter(mm, MM_ANONPAGES);
-}
+unsigned long get_mm_rss(struct mm_struct *mm);
+unsigned long get_file_rss(struct mm_struct *mm);
+unsigned long get_anon_rss(struct mm_struct *mm);
+unsigned long get_low_rss(struct mm_struct *mm);

 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
@@ -978,6 +977,23 @@ static inline void setmax_mm_hiwater_rss
 	*maxrss = hiwater_rss;
 }

+/* Utility for lowmem counting */
+static inline void
+inc_mm_counter_page(struct mm_struct *mm, int member, struct page *page)
+{
+	if (unlikely(is_lowmem_page(page)))
+		member += LOWMEM_COUNTER;
+	inc_mm_counter(mm, member);
+}
+
+static inline void
+dec_mm_counter_page(struct mm_struct *mm, int member, struct page *page)
+{
+	if (unlikely(is_lowmem_page(page)))
+		member += LOWMEM_COUNTER;
+	dec_mm_counter(mm, member);
+}
+
 void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);

 /*
@@ -1034,6 +1050,7 @@ int __pmd_alloc(struct mm_struct *mm, pu
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);

+
 /*
  * The following ifdef needed to get the 4level-fixup.h header to work.
  * Remove it when 4level-fixup.h has been removed.
Index: mmotm-2.6.32-Dec8-pth/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Dec8-pth/fs/proc/task_mmu.c
@@ -68,11 +68,11 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES);
+	*shared = get_file_rss(mm);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	*resident = *shared + get_anon_rss(mm);
 	return mm->total_vm;
 }
Index: mmotm-2.6.32-Dec8-pth/mm/oom_kill.c
===================================================================
--- mmotm-2.6.32-Dec8-pth.orig/mm/oom_kill.c
+++ mmotm-2.6.32-Dec8-pth/mm/oom_kill.c
@@ -398,11 +398,13 @@ static void __oom_kill_task(struct task_
 	if (verbose)
 		printk(KERN_ERR "Killed process %d (%s) "
-			"vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+			"vsz:%lukB, anon-rss:%lukB, file-rss:%lukB "
+			"lowmem %lukB\n",
 			task_pid_nr(p), p->comm,
 			K(p->mm->total_vm),
-			K(get_mm_counter(p->mm, MM_ANONPAGES)),
-			K(get_mm_counter(p->mm, MM_FILEPAGES)));
+			K(get_anon_rss(p->mm)),
+			K(get_file_rss(p->mm)),
+			K(get_low_rss(p->mm)));
 	task_unlock(p);

 	/*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/