Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751821Ab2F3FYM (ORCPT ); Sat, 30 Jun 2012 01:24:12 -0400 Received: from mail-qc0-f174.google.com ([209.85.216.174]:58535 "EHLO mail-qc0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751019Ab2F3FYK (ORCPT ); Sat, 30 Jun 2012 01:24:10 -0400 Date: Sat, 30 Jun 2012 01:24:05 -0400 From: Konrad Rzeszutek Wilk To: Andrea Arcangeli Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, Hillf Danton , Dan Smith , Peter Zijlstra , Linus Torvalds , Andrew Morton , Thomas Gleixner , Ingo Molnar , Paul Turner , Suresh Siddha , Mike Galbraith , "Paul E. McKenney" , Lai Jiangshan , Bharata B Rao , Lee Schermerhorn , Rik van Riel , Johannes Weiner , Srivatsa Vaddagiri , Christoph Lameter , Alex Shi , Mauricio Faria de Oliveira , Konrad Rzeszutek Wilk , Don Morris , Benjamin Herrenschmidt Subject: Re: [PATCH 36/40] autonuma: page_autonuma Message-ID: <20120630052404.GH3975@localhost.localdomain> References: <1340888180-15355-1-git-send-email-aarcange@redhat.com> <1340888180-15355-37-git-send-email-aarcange@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1340888180-15355-37-git-send-email-aarcange@redhat.com> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12321 Lines: 358 On Thu, Jun 28, 2012 at 02:56:16PM +0200, Andrea Arcangeli wrote: > Move the AutoNUMA per page information from the "struct page" to a > separate page_autonuma data structure allocated in the memsection > (with sparsemem) or in the pgdat (with flatmem). > > This is done to avoid growing the size of the "struct page" and the > page_autonuma data is only allocated if the kernel has been booted on > real NUMA hardware (or if noautonuma is passed as parameter to the > kernel). 
> > Signed-off-by: Andrea Arcangeli > --- > include/linux/autonuma.h | 18 +++- > include/linux/autonuma_flags.h | 6 + > include/linux/autonuma_types.h | 55 ++++++++++ > include/linux/mm_types.h | 26 ----- > include/linux/mmzone.h | 14 +++- > include/linux/page_autonuma.h | 53 +++++++++ > init/main.c | 2 + > mm/Makefile | 2 +- > mm/autonuma.c | 98 ++++++++++------- > mm/huge_memory.c | 26 +++-- > mm/page_alloc.c | 21 +--- > mm/page_autonuma.c | 234 ++++++++++++++++++++++++++++++++++++++++ > mm/sparse.c | 126 ++++++++++++++++++++- > 13 files changed, 577 insertions(+), 104 deletions(-) > create mode 100644 include/linux/page_autonuma.h > create mode 100644 mm/page_autonuma.c > > diff --git a/include/linux/autonuma.h b/include/linux/autonuma.h > index 85ca5eb..67af86a 100644 > --- a/include/linux/autonuma.h > +++ b/include/linux/autonuma.h > @@ -7,15 +7,26 @@ > > extern void autonuma_enter(struct mm_struct *mm); > extern void autonuma_exit(struct mm_struct *mm); > -extern void __autonuma_migrate_page_remove(struct page *page); > +extern void __autonuma_migrate_page_remove(struct page *, > + struct page_autonuma *); > extern void autonuma_migrate_split_huge_page(struct page *page, > struct page *page_tail); > extern void autonuma_setup_new_exec(struct task_struct *p); > +extern struct page_autonuma *lookup_page_autonuma(struct page *page); > > static inline void autonuma_migrate_page_remove(struct page *page) > { > - if (ACCESS_ONCE(page->autonuma_migrate_nid) >= 0) > - __autonuma_migrate_page_remove(page); > + struct page_autonuma *page_autonuma = lookup_page_autonuma(page); > + if (ACCESS_ONCE(page_autonuma->autonuma_migrate_nid) >= 0) > + __autonuma_migrate_page_remove(page, page_autonuma); > +} > + > +static inline void autonuma_free_page(struct page *page) > +{ > + if (!autonuma_impossible()) { I think you would be better off using a different name. 
Perhaps 'if (autonuma_on())' > + autonuma_migrate_page_remove(page); > + lookup_page_autonuma(page)->autonuma_last_nid = -1; > + } > } > > #define autonuma_printk(format, args...) \ > @@ -29,6 +40,7 @@ static inline void autonuma_migrate_page_remove(struct page *page) {} > static inline void autonuma_migrate_split_huge_page(struct page *page, > struct page *page_tail) {} > static inline void autonuma_setup_new_exec(struct task_struct *p) {} > +static inline void autonuma_free_page(struct page *page) {} > > #endif /* CONFIG_AUTONUMA */ > > diff --git a/include/linux/autonuma_flags.h b/include/linux/autonuma_flags.h > index 5e29a75..035d993 100644 > --- a/include/linux/autonuma_flags.h > +++ b/include/linux/autonuma_flags.h > @@ -15,6 +15,12 @@ enum autonuma_flag { > > extern unsigned long autonuma_flags; > > +static inline bool autonuma_impossible(void) > +{ > + return num_possible_nodes() <= 1 || > + test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags); > +} > + > static inline bool autonuma_enabled(void) > { > return !!test_bit(AUTONUMA_FLAG, &autonuma_flags); > diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h > index 9e697e3..1e860f6 100644 > --- a/include/linux/autonuma_types.h > +++ b/include/linux/autonuma_types.h > @@ -39,6 +39,61 @@ struct task_autonuma { > unsigned long task_numa_fault[0]; > }; > > +/* > + * Per page (or per-pageblock) structure dynamically allocated only if > + * autonuma is not impossible. not impossible? So possible? > + */ > +struct page_autonuma { > + /* > + * To modify autonuma_last_nid lockless the architecture, > + * needs SMP atomic granularity < sizeof(long), not all archs > + * have that, notably some ancient alpha (but none of those > + * should run in NUMA systems). Archs without that requires > + * autonuma_last_nid to be a long. > + */ > +#if BITS_PER_LONG > 32 > + /* > + * autonuma_migrate_nid is -1 if the page_autonuma structure > + * is not linked into any > + * pgdat->autonuma_migrate_head. 
Otherwise it means the > + * page_autonuma structure is linked into the > + * &NODE_DATA(autonuma_migrate_nid)->autonuma_migrate_head[page_nid]. > + * page_nid is the nid that the page (referenced by the > + * page_autonuma structure) belongs to. > + */ > + int autonuma_migrate_nid; > + /* > + * autonuma_last_nid records which is the NUMA nid that tried > + * to access this page at the last NUMA hinting page fault. > + * If it changed, AutoNUMA will not try to migrate the page to > + * the nid where the thread is running on and to the contrary, > + * it will make different threads trashing on the same pages, > + * converge on the same NUMA node (if possible). > + */ > + int autonuma_last_nid; > +#else > +#if MAX_NUMNODES >= 32768 > +#error "too many nodes" > +#endif > + short autonuma_migrate_nid; > + short autonuma_last_nid; > +#endif > + /* > + * This is the list node that links the page (referenced by > + * the page_autonuma structure) in the > + * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru. > + */ > + struct list_head autonuma_migrate_node; > + > + /* > + * To find the page starting from the autonuma_migrate_node we > + * need a backlink. > + * > + * FIXME: drop it; > + */ > + struct page *page; > +}; > + > extern int alloc_task_autonuma(struct task_struct *tsk, > struct task_struct *orig, > int node); > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index d1248cf..f0c6379 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -136,32 +136,6 @@ struct page { > struct page *first_page; /* Compound tail pages */ > }; > > -#ifdef CONFIG_AUTONUMA > - /* > - * FIXME: move to pgdat section along with the memcg and allocate > - * at runtime only in presence of a numa system. > - */ > - /* > - * To modify autonuma_last_nid lockless the architecture, > - * needs SMP atomic granularity < sizeof(long), not all archs > - * have that, notably some ancient alpha (but none of those > - * should run in NUMA systems). 
Archs without that requires > - * autonuma_last_nid to be a long. > - */ > -#if BITS_PER_LONG > 32 > - int autonuma_migrate_nid; > - int autonuma_last_nid; > -#else > -#if MAX_NUMNODES >= 32768 > -#error "too many nodes" > -#endif > - /* FIXME: remember to check the updates are atomic */ > - short autonuma_migrate_nid; > - short autonuma_last_nid; > -#endif > - struct list_head autonuma_migrate_node; > -#endif > - > /* > * On machines where all RAM is mapped into kernel address space, > * we can simply calculate the virtual address. On machines with > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index d53b26a..e66da74 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -698,10 +698,13 @@ typedef struct pglist_data { > int kswapd_max_order; > enum zone_type classzone_idx; > #ifdef CONFIG_AUTONUMA > - spinlock_t autonuma_lock; > +#if !defined(CONFIG_SPARSEMEM) > + struct page_autonuma *node_page_autonuma; > +#endif > struct list_head autonuma_migrate_head[MAX_NUMNODES]; > unsigned long autonuma_nr_migrate_pages; > wait_queue_head_t autonuma_knuma_migrated_wait; > + spinlock_t autonuma_lock; > #endif > } pg_data_t; > > @@ -1064,6 +1067,15 @@ struct mem_section { > * section. (see memcontrol.h/page_cgroup.h about this.) > */ > struct page_cgroup *page_cgroup; > +#endif > +#ifdef CONFIG_AUTONUMA > + /* > + * If !SPARSEMEM, pgdat doesn't have page_autonuma pointer. We use > + * section. 
> + */ > + struct page_autonuma *section_page_autonuma; > +#endif > +#if defined(CONFIG_CGROUP_MEM_RES_CTLR) ^ defined(CONFIG_AUTONUMA) > unsigned long pad; > #endif > }; > diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h > new file mode 100644 > index 0000000..d748aa2 > --- /dev/null > +++ b/include/linux/page_autonuma.h > @@ -0,0 +1,53 @@ > +#ifndef _LINUX_PAGE_AUTONUMA_H > +#define _LINUX_PAGE_AUTONUMA_H > + > +#if defined(CONFIG_AUTONUMA) && !defined(CONFIG_SPARSEMEM) > +extern void __init page_autonuma_init_flatmem(void); > +#else > +static inline void __init page_autonuma_init_flatmem(void) {} > +#endif > + > +#ifdef CONFIG_AUTONUMA > + > +#include > + > +extern void __meminit page_autonuma_map_init(struct page *page, > + struct page_autonuma *page_autonuma, > + int nr_pages); > + > +#ifdef CONFIG_SPARSEMEM > +#define PAGE_AUTONUMA_SIZE (sizeof(struct page_autonuma)) > +#define SECTION_PAGE_AUTONUMA_SIZE (PAGE_AUTONUMA_SIZE * \ > + PAGES_PER_SECTION) > +#endif > + > +extern void __meminit pgdat_autonuma_init(struct pglist_data *); > + > +#else /* CONFIG_AUTONUMA */ > + > +#ifdef CONFIG_SPARSEMEM > +struct page_autonuma; > +#define PAGE_AUTONUMA_SIZE 0 > +#define SECTION_PAGE_AUTONUMA_SIZE 0 > + > +#define autonuma_impossible() true > + > +#endif > + > +static inline void pgdat_autonuma_init(struct pglist_data *pgdat) {} > + > +#endif /* CONFIG_AUTONUMA */ > + > +#ifdef CONFIG_SPARSEMEM > +extern struct page_autonuma * __meminit __kmalloc_section_page_autonuma(int nid, > + unsigned long nr_pages); > +extern void __kfree_section_page_autonuma(struct page_autonuma *page_autonuma, > + unsigned long nr_pages); > +extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **page_autonuma_map, > + unsigned long pnum_begin, > + unsigned long pnum_end, > + unsigned long map_count, > + int nodeid); > +#endif > + > +#endif /* _LINUX_PAGE_AUTONUMA_H */ > diff --git a/init/main.c b/init/main.c > index b5cc0a7..070a377 100644 > 
--- a/init/main.c > +++ b/init/main.c > @@ -68,6 +68,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -455,6 +456,7 @@ static void __init mm_init(void) > * bigger than MAX_ORDER unless SPARSEMEM. > */ > page_cgroup_init_flatmem(); > + page_autonuma_init_flatmem(); > mem_init(); > kmem_cache_init(); > percpu_init_late(); > diff --git a/mm/Makefile b/mm/Makefile > index 15900fd..a4d8354 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o > obj-$(CONFIG_HAS_DMA) += dmapool.o > obj-$(CONFIG_HUGETLBFS) += hugetlb.o > obj-$(CONFIG_NUMA) += mempolicy.o > -obj-$(CONFIG_AUTONUMA) += autonuma.o > +obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o > obj-$(CONFIG_SPARSEMEM) += sparse.o > obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o > obj-$(CONFIG_SLOB) += slob.o > diff --git a/mm/autonuma.c b/mm/autonuma.c > index f44272b..ec4d492 100644 > --- a/mm/autonuma.c > +++ b/mm/autonuma.c > @@ -51,12 +51,6 @@ static struct knumad_scan { > .mm_head = LIST_HEAD_INIT(knumad_scan.mm_head), > }; > > -static inline bool autonuma_impossible(void) > -{ > - return num_possible_nodes() <= 1 || > - test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags); > -} > - > static inline void autonuma_migrate_lock(int nid) > { > spin_lock(&NODE_DATA(nid)->autonuma_lock); > @@ -82,54 +76,63 @@ void autonuma_migrate_split_huge_page(struct page *page, > struct page *page_tail) > { > int nid, last_nid; > + struct page_autonuma *page_autonuma, *page_tail_autonuma; > > - nid = page->autonuma_migrate_nid; > + if (autonuma_impossible()) Wouldn't it be better to just call it 'autonuma_off()'? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/