Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S935692AbYCSU7A (ORCPT ); Wed, 19 Mar 2008 16:59:00 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753658AbYCSTtf (ORCPT ); Wed, 19 Mar 2008 15:49:35 -0400 Received: from gir.skynet.ie ([193.1.99.77]:38417 "EHLO gir.skynet.ie" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1759445AbYCSTtX (ORCPT ); Wed, 19 Mar 2008 15:49:23 -0400 Date: Tue, 18 Mar 2008 16:27:14 +0000 From: Mel Gorman To: Andi Kleen Cc: linux-kernel@vger.kernel.org, pj@sgi.com, linux-mm@kvack.org, nickpiggin@yahoo.com.au Subject: Re: [PATCH] [12/18] Add support to allocate hugetlb pages that are larger than MAX_ORDER Message-ID: <20080318162714.GL23866@csn.ul.ie> References: <20080317258.659191058@firstfloor.org> <20080317015826.110AA1B41E0@basil.firstfloor.org> MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-15 Content-Disposition: inline In-Reply-To: <20080317015826.110AA1B41E0@basil.firstfloor.org> User-Agent: Mutt/1.5.13 (2006-08-11) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5475 Lines: 174 On (17/03/08 02:58), Andi Kleen didst pronounce: > This is needed on x86-64 to handle GB pages in hugetlbfs, because it is > not practical to enlarge MAX_ORDER to 1GB. > > Instead the 1GB pages are only allocated at boot using the bootmem > allocator using the hugepages=... option. > > These 1G bootmem pages are never freed. In theory it would be possible > to implement that with some complications, but since it would be a one-way > street (> MAX_ORDER pages cannot be allocated later) I decided not to currently. > > The > MAX_ORDER code is not ifdef'ed per architecture. It is not very big > and the ifdef ugliness seemed not to be worth it. > > Known problems: /proc/meminfo and "free" do not display the memory > allocated for GB pages in "Total". This is a little confusing for the > user. 
> > Signed-off-by: Andi Kleen > > --- > mm/hugetlb.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 62 insertions(+), 2 deletions(-) > > Index: linux/mm/hugetlb.c > =================================================================== > --- linux.orig/mm/hugetlb.c > +++ linux/mm/hugetlb.c > @@ -14,6 +14,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -153,7 +154,7 @@ static void free_huge_page(struct page * > INIT_LIST_HEAD(&page->lru); > > spin_lock(&hugetlb_lock); > - if (h->surplus_huge_pages_node[nid]) { > + if (h->surplus_huge_pages_node[nid] && h->order <= MAX_ORDER) { > update_and_free_page(h, page); > h->surplus_huge_pages--; > h->surplus_huge_pages_node[nid]--; > @@ -215,6 +216,9 @@ static struct page *alloc_fresh_huge_pag > { > struct page *page; > > + if (h->order > MAX_ORDER) > + return NULL; > + Should this print out a KERN_INFO message to the effect that pages of that size must be reserved at boot-time? > page = alloc_pages_node(nid, > htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, > huge_page_order(h)); > @@ -271,6 +275,9 @@ static struct page *alloc_buddy_huge_pag > struct page *page; > unsigned int nid; > > + if (h->order > MAX_ORDER) > + return NULL; > + > /* > * Assume we will successfully allocate the surplus page to > * prevent racing processes from causing the surplus to exceed > @@ -422,6 +429,10 @@ return_unused_surplus_pages(struct hstat > /* Uncommit the reservation */ > h->resv_huge_pages -= unused_resv_pages; > > + /* Cannot return gigantic pages currently */ > + if (h->order > MAX_ORDER) > + return; > + > nr_pages = min(unused_resv_pages, h->surplus_huge_pages); > > while (nr_pages) { > @@ -499,6 +510,44 @@ static struct page *alloc_huge_page(stru > return page; > } > > +static __initdata LIST_HEAD(huge_boot_pages); > + > +struct huge_bm_page { > + struct list_head list; > + struct hstate *hstate; > +}; > + > +static int __init alloc_bm_huge_page(struct hstate *h) 
> +{ > + struct huge_bm_page *m; > + m = __alloc_bootmem_node_nopanic(NODE_DATA(h->hugetlb_next_nid), > + huge_page_size(h), huge_page_size(h), > + 0); > + if (!m) > + return 0; > + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); > + /* Put them into a private list first because mem_map is not up yet */ > + list_add(&m->list, &huge_boot_pages); > + m->hstate = h; > + huge_next_node(h); > + return 1; > +} > + > +/* Put bootmem huge pages into the standard lists after mem_map is up */ > +static int __init huge_init_bm(void) > +{ > + struct huge_bm_page *m; > + list_for_each_entry (m, &huge_boot_pages, list) { > + struct page *page = virt_to_page(m); > + struct hstate *h = m->hstate; > + __ClearPageReserved(page); > + prep_compound_page(page, h->order); > + huge_new_page(h, page); > + } > + return 0; > +} > +__initcall(huge_init_bm); > + > static int __init hugetlb_init_hstate(struct hstate *h) > { > unsigned long i; > @@ -509,7 +558,10 @@ static int __init hugetlb_init_hstate(st > h->hugetlb_next_nid = first_node(node_online_map); > > for (i = 0; i < max_huge_pages[h - hstates]; ++i) { > - if (!alloc_fresh_huge_page(h)) > + if (h->order > MAX_ORDER) { > + if (!alloc_bm_huge_page(h)) > + break; > + } else if (!alloc_fresh_huge_page(h)) > break; > } > max_huge_pages[h - hstates] = h->free_huge_pages = h->nr_huge_pages = i; > @@ -581,6 +633,9 @@ static void do_try_to_free_low(struct hs > { > int i; > > + if (h->order > MAX_ORDER) > + return; > + > for (i = 0; i < MAX_NUMNODES; ++i) { > struct page *page, *next; > struct list_head *freel = &h->hugepage_freelists[i]; > @@ -618,6 +673,11 @@ set_max_huge_pages(struct hstate *h, uns > > *err = 0; > > + if (h->order > MAX_ORDER) { > + *err = -EINVAL; > + return max_huge_pages[h - hstates]; > + } > + Ah, scratch the comment on an earlier patch where I said I cannot see where err ever gets updated in set_max_huge_pages(). > /* > * Increase the pool size > * First take pages out of surplus state. 
Then make up the > -- Mel Gorman Part-time PhD Student Linux Technology Center University of Limerick IBM Dublin Software Lab -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/