From: Ingo Molnar
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Paul Turner, Lee Schermerhorn, Christoph Lameter, Rik van Riel,
    Mel Gorman, Andrew Morton, Andrea Arcangeli, Linus Torvalds,
    Peter Zijlstra, Thomas Gleixner
Subject: [PATCH 22/31] sched, numa, mm: Add last_cpu to page flags
Date: Tue, 13 Nov 2012 18:13:45 +0100
Message-Id: <1352826834-11774-23-git-send-email-mingo@kernel.org>
X-Mailer: git-send-email 1.7.11.7
In-Reply-To: <1352826834-11774-1-git-send-email-mingo@kernel.org>
References: <1352826834-11774-1-git-send-email-mingo@kernel.org>

From: Peter Zijlstra

Introduce a per-page last_cpu field and fold it into the struct
page::flags field whenever possible.

The unlikely/rare 32bit NUMA configs will likely grow the page-frame.

[ Completely dropping 32bit support for CONFIG_NUMA_BALANCING would
  simplify things, but it would also remove the warning if we grow
  enough 64bit-only page flags to push the last-cpu out. ]

Suggested-by: Rik van Riel
Signed-off-by: Peter Zijlstra
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Peter Zijlstra
Cc: Andrea Arcangeli
Cc: Rik van Riel
Cc: Mel Gorman
Signed-off-by: Ingo Molnar
---
 include/linux/mm.h                | 90 +++++++++++++++++++++------------------
 include/linux/mm_types.h          |  5 +++
 include/linux/mmzone.h            | 14 +-----
 include/linux/page-flags-layout.h | 83 ++++++++++++++++++++++++++++++++++++
 kernel/bounds.c                   |  4 ++
 mm/huge_memory.c                  |  3 ++
 mm/memory.c                       |  4 ++
 7 files changed, 149 insertions(+), 54 deletions(-)
 create mode 100644 include/linux/page-flags-layout.h

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1821629..141a28f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -594,50 +594,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-
-/*
- * page->flags layout:
- *
- * There are three possibilities for how page->flags get
- * laid out. The first is for the normal case, without
- * sparsemem. The second is for sparsemem when there is
- * plenty of space for node and section. The last is when
- * we have run out of space and have to fall back to an
- * alternate (slower) way of determining the node.
- *
- * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
- * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
- * classic sparse no space for node:  | SECTION | ZONE | ... | FLAGS |
- */
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define SECTIONS_WIDTH		SECTIONS_SHIFT
-#else
-#define SECTIONS_WIDTH		0
-#endif
-
-#define ZONES_WIDTH		ZONES_SHIFT
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define NODES_WIDTH		NODES_SHIFT
-#else
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-#error "Vmemmap: No space for nodes field in page flags"
-#endif
-#define NODES_WIDTH		0
-#endif
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPU] | ... | FLAGS | */
 #define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
-
-/*
- * We are going to use the flags for the page to node mapping if its in
- * there. This includes the case where there is no node, so it is implicit.
- */
-#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
-#define NODE_NOT_IN_PAGE_FLAGS
-#endif
+#define LAST_CPU_PGOFF		(ZONES_PGOFF - LAST_CPU_WIDTH)
 
 /*
  * Define the bit shifts to access each section. For non-existent
@@ -647,6 +608,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
+#define LAST_CPU_PGSHIFT	(LAST_CPU_PGOFF * (LAST_CPU_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -668,6 +630,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK		((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
+#define LAST_CPU_MASK		((1UL << LAST_CPU_WIDTH) - 1)
 #define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -706,6 +669,51 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef LAST_CPU_NOT_IN_PAGE_FLAGS
+static inline int page_xchg_last_cpu(struct page *page, int cpu)
+{
+	return xchg(&page->_last_cpu, cpu);
+}
+
+static inline int page_last_cpu(struct page *page)
+{
+	return page->_last_cpu;
+}
+#else
+static inline int page_xchg_last_cpu(struct page *page, int cpu)
+{
+	unsigned long old_flags, flags;
+	int last_cpu;
+
+	do {
+		old_flags = flags = page->flags;
+		last_cpu = (flags >> LAST_CPU_PGSHIFT) & LAST_CPU_MASK;
+
+		flags &= ~(LAST_CPU_MASK << LAST_CPU_PGSHIFT);
+		flags |= (cpu & LAST_CPU_MASK) << LAST_CPU_PGSHIFT;
+	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+
+	return last_cpu;
+}
+
+static inline int page_last_cpu(struct page *page)
+{
+	return (page->flags >> LAST_CPU_PGSHIFT) & LAST_CPU_MASK;
+}
+#endif /* LAST_CPU_NOT_IN_PAGE_FLAGS */
+#else /* CONFIG_NUMA_BALANCING */
+static inline int page_xchg_last_cpu(struct page *page, int cpu)
+{
+	return page_to_nid(page);
+}
+
+static inline int page_last_cpu(struct page *page)
+{
+	return page_to_nid(page);
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static inline struct zone *page_zone(const struct page *page)
 {
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 31f8a3a..7e9f758 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -175,6 +176,10 @@ struct page {
 					 */
 		void *shadow;
 #endif
+
+#ifdef LAST_CPU_NOT_IN_PAGE_FLAGS
+	int _last_cpu;
+#endif
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 50aaca8..7e116ed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -15,7 +15,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -318,16 +318,6 @@ enum zone_type {
	 * match the requested limits. See gfp_zone() in include/linux/gfp.h
 	 */
 
-#if MAX_NR_ZONES < 2
-#define ZONES_SHIFT 0
-#elif MAX_NR_ZONES <= 2
-#define ZONES_SHIFT 1
-#elif MAX_NR_ZONES <= 4
-#define ZONES_SHIFT 2
-#else
-#error ZONES_SHIFT -- too many zones configured adjust calculation
-#endif
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
@@ -1030,8 +1020,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
  * PA_SECTION_SHIFT		physical address to/from section number
  * PFN_SECTION_SHIFT		pfn to/from section number
  */
-#define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-
 #define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
 #define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
new file mode 100644
index 0000000..b258132
--- /dev/null
+++ b/include/linux/page-flags-layout.h
@@ -0,0 +1,83 @@
+#ifndef _LINUX_PAGE_FLAGS_LAYOUT
+#define _LINUX_PAGE_FLAGS_LAYOUT
+
+#include
+#include
+
+#if MAX_NR_ZONES < 2
+#define ZONES_SHIFT 0
+#elif MAX_NR_ZONES <= 2
+#define ZONES_SHIFT 1
+#elif MAX_NR_ZONES <= 4
+#define ZONES_SHIFT 2
+#else
+#error ZONES_SHIFT -- too many zones configured adjust calculation
+#endif
+
+#ifdef CONFIG_SPARSEMEM
+#include
+
+/*
+ * SECTION_SHIFT	#bits space required to store a section #
+ */
+#define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+#endif
+
+/*
+ * page->flags layout:
+ *
+ * There are five possibilities for how page->flags get laid out. The first
+ * (and second) is for the normal case, without sparsemem. The third is for
+ * sparsemem when there is plenty of space for node and section. The last is
+ * when we have run out of space and have to fall back to an alternate (slower)
+ * way of determining the node.
+ *
+ * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
+ *       " plus space for last_cpu:   | NODE | ZONE | LAST_CPU | ... | FLAGS |
+ * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
+ *       " plus space for last_cpu:   | SECTION | NODE | ZONE | LAST_CPU | ... | FLAGS |
+ * classic sparse no space for node:  | SECTION | ZONE | ... | FLAGS |
+ */
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#error "Vmemmap: No space for nodes field in page flags"
+#endif
+#define NODES_WIDTH		0
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+#define LAST_CPU_SHIFT	NR_CPUS_BITS
+#else
+#define LAST_CPU_SHIFT	0
+#endif
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPU_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define LAST_CPU_WIDTH	LAST_CPU_SHIFT
+#else
+#define LAST_CPU_WIDTH	0
+#endif
+
+/*
+ * We are going to use the flags for the page to node mapping if its in
+ * there. This includes the case where there is no node, so it is implicit.
+ */
+#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#define NODE_NOT_IN_PAGE_FLAGS
+#endif
+
+#if defined(CONFIG_NUMA_BALANCING) && LAST_CPU_WIDTH == 0
+#define LAST_CPU_NOT_IN_PAGE_FLAGS
+#endif
+
+#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862..e8ca97b 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 void foo(void)
 {
@@ -17,5 +18,8 @@ void foo(void)
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
 	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
 	/* End of constants */
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 931caf4..fbff718 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -745,6 +745,7 @@ void do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *new_page = NULL;
 	struct page *page = NULL;
 	int node, lru;
+	int last_cpu;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, entry)))
@@ -759,6 +760,7 @@ void do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = pmd_page(entry);
 	if (page) {
 		VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+		last_cpu = page_last_cpu(page);
 
 		get_page(page);
 		node = mpol_misplaced(page, vma, haddr);
@@ -1440,6 +1442,7 @@ static void __split_huge_page_refcount(struct page *page)
 		page_tail->mapping = page->mapping;
 		page_tail->index = page->index + i;
+		page_xchg_last_cpu(page, page_last_cpu(page_tail));
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
diff --git a/mm/memory.c b/mm/memory.c
index 0d26a28..1b9108c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,10 @@
 
 #include "internal.h"
 
+#ifdef LAST_CPU_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA config, growing page-frame for last_cpu.
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
-- 
1.7.11.7
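
For illustration only, and not part of the patch above: the following standalone
sketch mimics the shift/mask arithmetic that page_xchg_last_cpu() and
page_last_cpu() perform on page->flags when LAST_CPU fits into the flags word.
The helper names (flags_set_last_cpu, flags_get_last_cpu) and the width/offset
values are made-up example choices; the real kernel derives LAST_CPU_WIDTH and
LAST_CPU_PGSHIFT in include/linux/page-flags-layout.h from NR_CPUS_BITS and the
other field widths, and uses a cmpxchg() loop for atomicity.

/*
 * Standalone demo of packing a "last CPU" number into a flags word.
 * EX_LAST_CPU_WIDTH and EX_LAST_CPU_PGSHIFT are example values only.
 */
#include <stdio.h>

#define EX_LAST_CPU_WIDTH	10	/* e.g. ilog2(1024) possible CPUs */
#define EX_LAST_CPU_PGSHIFT	20	/* arbitrary example bit offset   */
#define EX_LAST_CPU_MASK	((1ULL << EX_LAST_CPU_WIDTH) - 1)

/* Like page_xchg_last_cpu(), minus the cmpxchg retry loop (single thread). */
static unsigned long long flags_set_last_cpu(unsigned long long flags, int cpu)
{
	flags &= ~(EX_LAST_CPU_MASK << EX_LAST_CPU_PGSHIFT);
	flags |= ((unsigned long long)cpu & EX_LAST_CPU_MASK) << EX_LAST_CPU_PGSHIFT;
	return flags;
}

/* Like page_last_cpu(): extract the stored CPU number. */
static int flags_get_last_cpu(unsigned long long flags)
{
	return (flags >> EX_LAST_CPU_PGSHIFT) & EX_LAST_CPU_MASK;
}

int main(void)
{
	unsigned long long flags = 0x3ULL;	/* pretend two page flag bits are set */

	flags = flags_set_last_cpu(flags, 37);
	printf("last_cpu=%d flags=%#llx\n", flags_get_last_cpu(flags), flags);
	/* prints: last_cpu=37 flags=0x2500003 */
	return 0;
}

Compiled with any C compiler (cc demo.c && ./a.out), the demo shows that the
existing low flag bits are preserved while the CPU number occupies its own
bit field, which is the point of folding last_cpu into page->flags instead of
growing struct page.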