The following patchset is a set of performance optimisations
for the mm subsystem. They mainly focus on the page allocator
because that is a very hot path for kbuild, which is my target
workload.
The performance improvements are not documented in detail yet, so the
patches are not intended for merging at this stage. There are also some
rmap optimisations that Hugh probably won't have time to ACK for a while.
However, a slightly older patchset was able to decrease kernel
residency by about 5% for UP, and 7.5% for SMP on a dual Xeon
doing kbuild.
--
SUSE Labs, Novell Inc.
Prefetch ptes a line ahead. Worth 25% on ia64 when doing big forks.
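(For illustration only: a userspace analogue of the macros below, assuming a
64-byte cache line and using GCC's __builtin_prefetch. It just shows the shape
of the prefetch-one-line-ahead walk; it is not the patch itself.)

#include <stdio.h>

#define LINE_BYTES	64
#define PER_LINE	(LINE_BYTES / sizeof(long))

static long walk(const long *p, unsigned long n)
{
	long sum = 0;
	unsigned long i;

	/* pte_prefetch_start(): fetch the current line and the next one */
	__builtin_prefetch(p);
	if (PER_LINE < n)
		__builtin_prefetch(p + PER_LINE);

	for (i = 0; i < n; i++) {
		sum += p[i];
		/* pte_prefetch_next(): on crossing into a new line,
		 * fetch the line after it */
		if ((i + 1) % PER_LINE == 0 && i + 1 + PER_LINE < n)
			__builtin_prefetch(p + i + 1 + PER_LINE);
	}
	return sum;
}

int main(void)
{
	long a[1024];
	int i;

	for (i = 0; i < 1024; i++)
		a[i] = i;
	printf("%ld\n", walk(a, 1024));
	return 0;
}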
Index: linux-2.6/include/asm-generic/pgtable.h
===================================================================
--- linux-2.6.orig/include/asm-generic/pgtable.h
+++ linux-2.6/include/asm-generic/pgtable.h
@@ -196,6 +196,33 @@ static inline void ptep_set_wrprotect(st
})
#endif
+#ifndef __HAVE_ARCH_PTE_PREFETCH
+#define PTES_PER_LINE (L1_CACHE_BYTES / sizeof(pte_t))
+#define PTE_LINE_MASK (~(PTES_PER_LINE - 1))
+#define ADDR_PER_LINE (PTES_PER_LINE << PAGE_SHIFT)
+#define ADDR_LINE_MASK (~(ADDR_PER_LINE - 1))
+
+#define pte_prefetch(pte, addr, end) \
+({ \
+ unsigned long __nextline = ((addr) + ADDR_PER_LINE) & ADDR_LINE_MASK; \
+ if (__nextline < (end)) \
+ prefetch(pte + PTES_PER_LINE); \
+})
+
+#define pte_prefetch_start(pte, addr, end) \
+({ \
+ prefetch(pte); \
+ pte_prefetch(pte, addr, end); \
+})
+
+#define pte_prefetch_next(pte, addr, end) \
+({ \
+ unsigned long __addr = (addr); \
+ if (!(__addr & ~ADDR_LINE_MASK)) /* We hit a new cacheline */ \
+ pte_prefetch(pte, __addr, end); \
+})
+#endif
+
#ifndef __ASSEMBLY__
/*
* When walking page tables, we usually want to skip any p?d_none entries;
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -437,6 +437,8 @@ again:
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
+ pte_prefetch_start(src_pte, addr, end);
+
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock(src_ptl);
@@ -458,7 +460,8 @@ again:
}
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
progress += 8;
- } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ } while (dst_pte++, src_pte++, addr += PAGE_SIZE,
+ pte_prefetch_next(src_pte, addr, end), addr != end);
spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
@@ -561,6 +564,7 @@ static unsigned long zap_pte_range(struc
int anon_rss = 0;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte_prefetch_start(pte, addr, end);
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
@@ -629,7 +633,8 @@ static unsigned long zap_pte_range(struc
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ } while (pte++, addr += PAGE_SIZE, pte_prefetch_next(pte, addr, end),
+ (addr != end && *zap_work > 0));
add_mm_rss(mm, file_rss, anon_rss);
pte_unmap_unlock(pte - 1, ptl);
Slightly optimise some page allocation and freeing functions by
taking advantage of knowing whether or not interrupts are disabled.
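(A userspace sketch of the pattern, for illustration: signal blocking stands
in for interrupt disabling, and the names are made up. Previously the bulk
routine saved and restored the "interrupt" state on every call; after the
change the caller does it once and the routine takes only the plain lock.)

#include <pthread.h>
#include <signal.h>
#include <stdio.h>

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static int freed;

/* bulk free: the caller is responsible for blocking signals ("interrupts") */
static void free_bulk(int count)
{
	pthread_mutex_lock(&zone_lock);		/* plain lock, no mask save/restore */
	freed += count;
	pthread_mutex_unlock(&zone_lock);
}

int main(void)
{
	sigset_t all, old;

	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, &old);		/* "local_irq_save()" */
	free_bulk(8);
	free_bulk(8);		/* second call pays no extra save/restore */
	pthread_sigmask(SIG_SETMASK, &old, NULL);	/* "local_irq_restore()" */

	printf("freed %d pages\n", freed);
	return 0;
}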
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -373,11 +373,10 @@ static int
free_pages_bulk(struct zone *zone, int count,
struct list_head *list, unsigned int order)
{
- unsigned long flags;
struct page *page = NULL;
int ret = 0;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
@@ -387,12 +386,13 @@ free_pages_bulk(struct zone *zone, int c
__free_pages_bulk(page, zone, order);
ret++;
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
return ret;
}
void __free_pages_ok(struct page *page, unsigned int order)
{
+ unsigned long flags;
LIST_HEAD(list);
int i;
@@ -410,7 +410,9 @@ void __free_pages_ok(struct page *page,
free_pages_check(__FUNCTION__, page + i);
list_add(&page->lru, &list);
kernel_map_pages(page, 1<<order, 0);
+ local_irq_save(flags);
free_pages_bulk(page_zone(page), 1, &list, order);
+ local_irq_restore(flags);
}
@@ -526,12 +528,11 @@ static struct page *__rmqueue(struct zon
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list)
{
- unsigned long flags;
int i;
int allocated = 0;
struct page *page;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
page = __rmqueue(zone, order);
if (page == NULL)
@@ -539,7 +540,7 @@ static int rmqueue_bulk(struct zone *zon
allocated++;
list_add_tail(&page->lru, list);
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
return allocated;
}
@@ -576,6 +577,7 @@ void drain_remote_pages(void)
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
+ unsigned long flags;
struct zone *zone;
int i;
@@ -587,8 +589,10 @@ static void __drain_pages(unsigned int c
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
+ local_irq_save(flags);
pcp->count -= free_pages_bulk(zone, pcp->count,
&pcp->list, 0);
+ local_irq_restore(flags);
}
}
}
@@ -726,16 +730,14 @@ buffered_rmqueue(struct zone *zone, int
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
- if (pcp->count) {
+ if (likely(pcp->count)) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
pcp->count--;
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
Optimise some pagevec functions by not re-enabling interrupts while
switching lru locks.
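(To show the pattern being tweaked: a userspace sketch of the
release_pages()-style loop, with pthread mutexes standing in for the zone
lru_locks and made-up names throughout. The loop only drops and reacquires a
lock when the zone actually changes; the patch additionally keeps interrupts
disabled across that switch instead of re-enabling them in between, and
prefetches the next lock. Not kernel code, just an illustration.)

#include <pthread.h>
#include <stdio.h>

#define NZONES 2

struct zone {
	pthread_mutex_t lru_lock;
	int freed;
};

static struct zone zones[NZONES] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

static void release_items(const int *zone_of, int n)
{
	struct zone *zone = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct zone *pagezone = &zones[zone_of[i]];

		if (pagezone != zone) {	/* switch locks only when the zone changes */
			if (zone)
				pthread_mutex_unlock(&zone->lru_lock);
			zone = pagezone;
			pthread_mutex_lock(&zone->lru_lock);
		}
		zone->freed++;
	}
	if (zone)
		pthread_mutex_unlock(&zone->lru_lock);
}

int main(void)
{
	int zone_of[] = { 0, 0, 0, 1, 1, 0 };

	release_items(zone_of, 6);
	printf("zone0=%d zone1=%d\n", zones[0].freed, zones[1].freed);
	return 0;
}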
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -220,10 +220,13 @@ void release_pages(struct page **pages,
pagezone = page_zone(page);
if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_lock_prefetch(&pagezone->lru_lock);
+ if (!zone)
+ local_irq_disable();
+ else
+ spin_unlock(&zone->lru_lock);
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock(&zone->lru_lock);
}
if (TestClearPageLRU(page))
del_page_from_lru(zone, page);
@@ -297,10 +300,12 @@ void __pagevec_lru_add(struct pagevec *p
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
+ if (!zone)
+ local_irq_disable();
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock(&zone->lru_lock);
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock(&zone->lru_lock);
}
if (TestSetPageLRU(page))
BUG();
@@ -324,10 +329,12 @@ void __pagevec_lru_add_active(struct pag
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
+ if (!zone)
+ local_irq_disable();
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock(&zone->lru_lock);
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock(&zone->lru_lock);
}
if (TestSetPageLRU(page))
BUG();
Slightly optimise rmap functions by minimising atomic operations when
we know there will be no concurrent modifications.
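(A userspace sketch of the idea, using C11 atomics and hypothetical names:
once an object is published and can be mapped concurrently, an atomic
read-modify-write is needed, but a freshly allocated object that nobody else
can see yet can have its count set with a plain initialising store, which is
what page_add_new_anon_rmap does with atomic_set(). Not kernel code.)

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int mapcount;
};

/* shared object: other threads may race, so use the atomic RMW */
static void map_shared(struct obj *o)
{
	atomic_fetch_add(&o->mapcount, 1);
}

/* brand-new object: not yet published, a plain initialising store is enough */
static struct obj *alloc_new_mapped(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->mapcount, 1);	/* like atomic_set(&page->_mapcount, 0) */
	return o;
}

int main(void)
{
	struct obj *o = alloc_new_mapped();

	map_shared(o);			/* later mappings still take the atomic path */
	printf("mapcount = %d\n", atomic_load(&o->mapcount));
	free(o);
	return 0;
}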
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h
+++ linux-2.6/include/linux/rmap.h
@@ -71,6 +71,7 @@ void __anon_vma_link(struct vm_area_stru
* rmap interfaces called when adding or removing pte of page
*/
void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
void page_remove_rmap(struct page *);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1337,14 +1337,15 @@ static int do_wp_page(struct mm_struct *
inc_mm_counter(mm, anon_rss);
dec_mm_counter(mm, file_rss);
}
+
flush_cache_page(vma, address, pfn);
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_establish(vma, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
+ page_add_new_anon_rmap(new_page, vma, address);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
/* Free the old page.. */
new_page = old_page;
@@ -1796,9 +1797,8 @@ static int do_anonymous_page(struct mm_s
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
+ page_add_new_anon_rmap(page, vma, address);
lru_cache_add_active(page);
- SetPageReferenced(page);
- page_add_anon_rmap(page, vma, address);
} else {
/* Map the ZERO_PAGE - vm_page_prot is readonly */
page = ZERO_PAGE(address);
@@ -1924,11 +1924,10 @@ retry:
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
+ page_add_new_anon_rmap(new_page, vma, address);
lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
} else if (!(vma->vm_flags & VM_RESERVED)) {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
@@ -1939,6 +1938,7 @@ retry:
goto unlock;
}
+ set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -440,6 +440,26 @@ int page_referenced(struct page *page, i
}
/**
+ * page_set_anon_rmap - setup new anonymous rmap
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+
+ page->index = linear_page_index(vma, address);
+
+ inc_page_state(nr_mapped);
+}
+
+/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -450,21 +470,28 @@ int page_referenced(struct page *page, i
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- if (atomic_inc_and_test(&page->_mapcount)) {
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
-
- page->index = linear_page_index(vma, address);
-
- inc_page_state(nr_mapped);
- }
+ if (atomic_inc_and_test(&page->_mapcount))
+ __page_set_anon_rmap(page, vma, address);
/* else checking page index and mapping is racy */
}
/**
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ *
+ * same as page_add_anon_rmap but must only be called on *new* pages.
+ */
+void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+ __page_set_anon_rmap(page, vma, address);
+}
+
+
+/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
@@ -487,21 +514,28 @@ void page_add_file_rmap(struct page *pag
*/
void page_remove_rmap(struct page *page)
{
- if (atomic_add_negative(-1, &page->_mapcount)) {
+ int fast = (page_mapcount(page) == 1) &
+ PageAnon(page) & (!PageSwapCache(page));
+
+ /* fast page may become SwapCache here, but nothing new will map it. */
+ if (fast)
+ reset_page_mapcount(page);
+ else if (atomic_add_negative(-1, &page->_mapcount))
BUG_ON(page_mapcount(page) < 0);
- /*
- * It would be tidy to reset the PageAnon mapping here,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_hot_cold_page,
- * and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
- */
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
- dec_page_state(nr_mapped);
- }
+ else
+ return; /* non zero mapcount */
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_hot_cold_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
+ dec_page_state(nr_mapped);
}
/*
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -182,6 +182,7 @@ extern void __mod_page_state(unsigned lo
#define PageReferenced(page) test_bit(PG_referenced, &(page)->flags)
#define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags)
+#define __SetPageReferenced(page) __set_bit(PG_referenced, &(page)->flags)
#define ClearPageReferenced(page) clear_bit(PG_referenced, &(page)->flags)
#define TestClearPageReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags)
Inline set_page_refs. Remove mm/internal.h
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -19,7 +19,6 @@
#include <linux/module.h>
#include <asm/dma.h>
#include <asm/io.h>
-#include "internal.h"
/*
* Access to this subsystem has to be serialized externally. (this is
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -38,7 +38,6 @@
#include <linux/vmalloc.h>
#include <asm/tlbflush.h>
-#include "internal.h"
/*
* MCD - HACK: Find somewhere to initialize this EARLY, or make this
@@ -448,23 +447,6 @@ expand(struct zone *zone, struct page *p
return page;
}
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
- set_page_count(page, 1);
-#else
- int i;
-
- /*
- * We need to reference all the pages for this order, otherwise if
- * anyone accesses one of the pages with (get/put) it will be freed.
- * - eg: access_process_vm()
- */
- for (i = 0; i < (1 << order); i++)
- set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
/*
* This page is about to be returned from the page allocator
*/
Index: linux-2.6/mm/internal.h
===================================================================
--- linux-2.6.orig/mm/internal.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* internal.h: mm/ internal definitions
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells ([email protected])
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-/* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -315,6 +315,23 @@ struct page {
#define set_page_count(p,v) atomic_set(&(p)->_count, v - 1)
#define __put_page(p) atomic_dec(&(p)->_count)
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+ set_page_count(page, 1);
+#else
+ int i;
+
+ /*
+ * We need to reference all the pages for this order, otherwise if
+ * anyone accesses one of the pages with (get/put) it will be freed.
+ * - eg: access_process_vm()
+ */
+ for (i = 0; i < (1 << order); i++)
+ set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
extern void FASTCALL(__page_cache_release(struct page *));
#ifdef CONFIG_HUGETLB_PAGE
bad_range is supposed to be a temporary check. It would be a pity to throw
it out. Make it depend on CONFIG_DEBUG_VM instead.
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -78,6 +78,7 @@ int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
+#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret = 0;
@@ -119,6 +120,13 @@ static int bad_range(struct zone *zone,
return 0;
}
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
+{
+ return 0;
+}
+#endif
+
static void bad_page(const char *function, struct page *page)
{
printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
Index: linux-2.6/lib/Kconfig.debug
===================================================================
--- linux-2.6.orig/lib/Kconfig.debug
+++ linux-2.6/lib/Kconfig.debug
@@ -172,7 +172,8 @@ config DEBUG_VM
bool "Debug VM"
depends on DEBUG_KERNEL
help
- Enable this to debug the virtual-memory system.
+ Enable this to turn on extended checks in the virtual-memory system
+ that may impact performance.
If unsure, say N.
Micro-optimise some conditionals where we don't need lazy (short-circuit) evaluation.
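(A small illustration of the trick: when the tests are cheap and side-effect
free, bitwise OR evaluates them all without a conditional branch per term,
whereas || short-circuits and costs a branch for each. The struct and flag
mask here are stand-ins, not the real page flags.)

#include <stdio.h>

struct page_like {
	int mapcount;
	void *mapping;
	int count;
	unsigned long flags;
};

#define BAD_FLAGS	0x3ful	/* stand-in for the PG_* mask in the patch */

static int check_lazy(const struct page_like *p)
{
	return p->mapcount || p->mapping != NULL || p->count != 0 ||
		(p->flags & BAD_FLAGS);			/* up to four branches */
}

static int check_branchless(const struct page_like *p)
{
	return p->mapcount | (p->mapping != NULL) | (p->count != 0) |
		((p->flags & BAD_FLAGS) != 0);		/* one test at the end */
}

int main(void)
{
	struct page_like ok = { 0, NULL, 0, 0 };
	struct page_like bad = { 0, NULL, 1, 0 };

	printf("%d %d\n", check_lazy(&ok), check_branchless(&ok));	/* 0 0 */
	printf("%d %d\n", check_lazy(&bad), check_branchless(&bad));	/* 1 1 */
	return 0;
}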
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -339,9 +339,9 @@ static inline void __free_pages_bulk (st
static inline void free_pages_check(const char *function, struct page *page)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -351,7 +351,7 @@ static inline void free_pages_check(cons
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
+ 1 << PG_reserved ))))
bad_page(function, page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -452,9 +452,9 @@ expand(struct zone *zone, struct page *p
*/
static void prep_new_page(struct page *page, int order)
{
- if ( page_mapcount(page) ||
- page->mapping != NULL ||
- page_count(page) != 0 ||
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
1 << PG_private |
@@ -465,7 +465,7 @@ static void prep_new_page(struct page *p
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_reserved )))
+ 1 << PG_reserved ))))
bad_page(__FUNCTION__, page);
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
pcp->low is useless.
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -46,7 +46,6 @@ struct zone_padding {
struct per_cpu_pages {
int count; /* number of pages in the list */
- int low; /* low watermark, refill needed */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
struct list_head list; /* the list of pages */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -712,7 +712,7 @@ buffered_rmqueue(struct zone *zone, int
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
+ if (!pcp->count)
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (likely(pcp->count)) {
@@ -1324,10 +1324,9 @@ void show_free_areas(void)
pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+ printk("cpu %d %s: high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
- pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch,
pageset->pcp[temperature].count);
@@ -1765,14 +1764,12 @@ inline void setup_pageset(struct per_cpu
pcp = &p->pcp[0]; /* hot */
pcp->count = 0;
- pcp->low = 0;
- pcp->high = 6 * batch;
+ pcp->high = 4 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
pcp = &p->pcp[1]; /* cold*/
pcp->count = 0;
- pcp->low = 0;
pcp->high = 2 * batch;
pcp->batch = max(1UL, batch/2);
INIT_LIST_HEAD(&pcp->list);
@@ -2169,12 +2166,10 @@ static int zoneinfo_show(struct seq_file
seq_printf(m,
"\n cpu: %i pcp: %i"
"\n count: %i"
- "\n low: %i"
"\n high: %i"
"\n batch: %i",
i, j,
pageset->pcp[j].count,
- pageset->pcp[j].low,
pageset->pcp[j].high,
pageset->pcp[j].batch);
}
Optimise page_state manipulations by introducing a direct accessor to
page_state fields that does not disable interrupts; callers must then
provide their own protection (either by disabling interrupts themselves,
or by never updating the field from interrupt context).
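(A userspace analogue of the accessor, with made-up names: each updater owns
its own counter slot, like a per-CPU copy of page_state, so a plain
non-atomic increment is safe as long as nothing asynchronous - for the kernel,
an interrupt on the same CPU - updates the same slot. Illustration only.)

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static unsigned long counters[NTHREADS];	/* one slot per thread, like per-cpu */

static void *worker(void *arg)
{
	long id = (long)arg;
	int i;

	for (i = 0; i < 1000000; i++)
		counters[id]++;		/* plain increment: only this thread writes this slot */
	return NULL;
}

int main(void)
{
	pthread_t threads[NTHREADS];
	unsigned long sum = 0;
	long i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&threads[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++) {
		pthread_join(threads[i], NULL);
		sum += counters[i];	/* the reader sums all the slots */
	}
	printf("%lu\n", sum);
	return 0;
}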
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -138,6 +138,7 @@ extern void get_page_state_node(struct p
extern void get_full_page_state(struct page_state *ret);
extern unsigned long __read_page_state(unsigned long offset);
extern void __mod_page_state(unsigned long offset, unsigned long delta);
+extern unsigned long *__page_state(unsigned long offset);
#define read_page_state(member) \
__read_page_state(offsetof(struct page_state, member))
@@ -150,16 +151,26 @@ extern void __mod_page_state(unsigned lo
#define add_page_state(member,delta) mod_page_state(member, (delta))
#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
-#define mod_page_state_zone(zone, member, delta) \
- do { \
- unsigned offset; \
- if (is_highmem(zone)) \
- offset = offsetof(struct page_state, member##_high); \
- else if (is_normal(zone)) \
- offset = offsetof(struct page_state, member##_normal); \
- else \
- offset = offsetof(struct page_state, member##_dma); \
- __mod_page_state(offset, (delta)); \
+#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
+
+#define state_zone_offset(zone, member) \
+({ \
+ unsigned offset; \
+ if (is_highmem(zone)) \
+ offset = offsetof(struct page_state, member##_high); \
+ else if (is_normal(zone)) \
+ offset = offsetof(struct page_state, member##_normal); \
+ else \
+ offset = offsetof(struct page_state, member##_dma); \
+ offset; \
+})
+
+#define page_state_zone(zone, member) \
+ (*__page_state(state_zone_offset(zone, member)))
+
+#define mod_page_state_zone(zone, member, delta) \
+ do { \
+ __mod_page_state(state_zone_offset(zone, member), (delta)); \
} while (0)
/*
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -400,8 +400,6 @@ void __free_pages_ok(struct page *page,
arch_free_page(page, order);
- mod_page_state(pgfree, 1 << order);
-
#ifndef CONFIG_MMU
if (order > 0)
for (i = 1 ; i < (1 << order) ; ++i)
@@ -413,6 +411,7 @@ void __free_pages_ok(struct page *page,
list_add(&page->lru, &list);
kernel_map_pages(page, 1<<order, 0);
local_irq_save(flags);
+ page_state(pgfree) += 1 << order;
free_pages_bulk(page_zone(page), 1, &list, order);
local_irq_restore(flags);
}
@@ -662,12 +661,12 @@ static void fastcall free_hot_cold_page(
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- inc_page_state(pgfree);
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(page);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
+ page_state(pgfree)++;
list_add(&page->lru, &pcp->list);
pcp->count++;
if (pcp->count >= pcp->high)
@@ -704,42 +703,50 @@ static struct page *
buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
- struct page *page = NULL;
+ struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
+ int cpu = get_cpu();
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
- if (!pcp->count)
+ if (!pcp->count) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
- if (likely(pcp->count)) {
- page = list_entry(pcp->list.next, struct page, lru);
- list_del(&page->lru);
- pcp->count--;
+ if (unlikely(!pcp->count))
+ goto failed;
}
- local_irq_restore(flags);
- put_cpu();
+ page = list_entry(pcp->list.next, struct page, lru);
+ list_del(&page->lru);
+ pcp->count--;
} else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
}
- if (page != NULL) {
- BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
- prep_new_page(page, order);
+ page_state_zone(zone, pgalloc) += 1 << order;
+ local_irq_restore(flags);
+ put_cpu();
- if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ BUG_ON(bad_range(zone, page));
+ prep_new_page(page, order);
- if (order && (gfp_flags & __GFP_COMP))
- prep_compound_page(page, order);
- }
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order, gfp_flags);
+
+ if (order && (gfp_flags & __GFP_COMP))
+ prep_compound_page(page, order);
return page;
+
+failed:
+ local_irq_restore(flags);
+ put_cpu();
+ return NULL;
}
/*
@@ -1215,6 +1222,15 @@ unsigned long __read_page_state(unsigned
return ret;
}
+unsigned long *__page_state(unsigned long offset)
+{
+ void* ptr;
+ ptr = &__get_cpu_var(page_states);
+ return (unsigned long*)(ptr + offset);
+}
+
+EXPORT_SYMBOL(__page_state);
+
void __mod_page_state(unsigned long offset, unsigned long delta)
{
unsigned long flags;
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -641,17 +641,18 @@ static void shrink_cache(struct zone *zo
goto done;
max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
sc->nr_to_reclaim -= nr_freed;
- spin_lock_irq(&zone->lru_lock);
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ page_state_zone(zone, pgscan_kswapd) += nr_scan;
+ page_state(kswapd_steal) += nr_freed;
+ } else
+ page_state_zone(zone, pgscan_direct) += nr_scan;
+ page_state_zone(zone, pgsteal) += nr_freed;
+
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
@@ -813,11 +814,13 @@ refill_inactive_zone(struct zone *zone,
}
}
zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ spin_unlock(&zone->lru_lock);
+
+ page_state_zone(zone, pgrefill) += pgscanned;
+ page_state(pgdeactivate) += pgdeactivate;
+ local_irq_enable();
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
+ pagevec_release(&pvec);
}
/*
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -456,7 +456,11 @@ static void __page_set_anon_rmap(struct
page->index = linear_page_index(vma, address);
- inc_page_state(nr_mapped);
+ /*
+ * nr_mapped state can be updated without turning off
+ * interrupts because it is not modified via interrupt.
+ */
+ page_state(nr_mapped)++;
}
/**
@@ -503,7 +507,7 @@ void page_add_file_rmap(struct page *pag
BUG_ON(!pfn_valid(page_to_pfn(page)));
if (atomic_inc_and_test(&page->_mapcount))
- inc_page_state(nr_mapped);
+ page_state(nr_mapped)++;
}
/**
@@ -535,7 +539,7 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- dec_page_state(nr_mapped);
+ page_state(nr_mapped)--;
}
/*
Use a single pcp list.
Having separate hot and cold pcp lists means that cold pages are overlooked
when a hot page is needed but none is available. So a workload doing heavy
page reclaim will not get much benefit from the pcp lists for minimising
zone lock contention on the pages it is freeing up.
The same wastage applies the other way around (eg. when the hot list fills
up while the cold list is empty); the patch takes care of that case too.
Cold page allocations are still not allowed to take hot pages, though.
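(A toy sketch of the single-list policy, with a small ring standing in for
the pcp list and invented names: hot frees and hot allocations work at the
head, cold frees and cold allocations work at the tail, and cold_count tracks
how much of the tail is cold. No bounds or empty checks; the real patch also
prefers refilling with fresh cold pages over handing hot pages to a cold
allocation. Purely illustrative, not kernel code.)

#include <stdio.h>

#define CAP 16

static int ring[CAP];			/* the single pcp list */
static int head, count, cold_count;	/* head = hottest entry */

static void free_hot(int page)
{
	head = (head + CAP - 1) % CAP;	/* push at the head */
	ring[head] = page;
	count++;
}

static void free_cold(int page)
{
	ring[(head + count) % CAP] = page;	/* push at the tail */
	count++;
	cold_count++;
}

static int alloc_hot(void)
{
	int page = ring[head];		/* pop the most recently freed hot page */

	head = (head + 1) % CAP;
	count--;
	if (cold_count > count)
		cold_count = count;
	return page;
}

static int alloc_cold(void)
{
	int page = ring[(head + count - 1) % CAP];	/* pop from the cold tail */

	count--;
	if (cold_count)
		cold_count--;
	return page;
}

int main(void)
{
	free_hot(1);
	free_hot(2);
	free_cold(9);
	printf("hot alloc  -> page %d\n", alloc_hot());		/* 2 */
	printf("cold alloc -> page %d\n", alloc_cold());	/* 9 */
	return 0;
}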
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -44,15 +44,13 @@ struct zone_padding {
#define ZONE_PADDING(name)
#endif
-struct per_cpu_pages {
+struct per_cpu_pageset {
+ struct list_head list; /* the list of pages */
int count; /* number of pages in the list */
+ int cold_count; /* number of cold pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
- struct list_head list; /* the list of pages */
-};
-struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
#ifdef CONFIG_NUMA
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -533,10 +533,8 @@ static int rmqueue_bulk(struct zone *zon
void drain_remote_pages(void)
{
struct zone *zone;
- int i;
unsigned long flags;
- local_irq_save(flags);
for_each_zone(zone) {
struct per_cpu_pageset *pset;
@@ -544,17 +542,16 @@ void drain_remote_pages(void)
if (zone->zone_pgdat->node_id == numa_node_id())
continue;
- pset = zone->pageset[smp_processor_id()];
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
-
- pcp = &pset->pcp[i];
- if (pcp->count)
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ local_irq_save(flags);
+ if (zone->zone_pgdat->node_id != numa_node_id()) {
+ pset = zone->pageset[smp_processor_id()];
+ if (pset->count)
+ pset->count -= free_pages_bulk(zone,
+ pset->count, &pset->list, 0);
+ pset->cold_count = min(pset->cold_count, pset->count);
}
+ local_irq_restore(flags);
}
- local_irq_restore(flags);
}
#endif
@@ -563,21 +560,16 @@ static void __drain_pages(unsigned int c
{
unsigned long flags;
struct zone *zone;
- int i;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
pset = zone_pcp(zone, cpu);
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
-
- pcp = &pset->pcp[i];
- local_irq_save(flags);
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
- local_irq_restore(flags);
- }
+ local_irq_save(flags);
+ pset->count -= free_pages_bulk(zone, pset->count,
+ &pset->list, 0);
+ pset->cold_count = min(pset->cold_count, pset->count);
+ local_irq_restore(flags);
}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
@@ -655,7 +647,8 @@ static void FASTCALL(free_hot_cold_page(
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
+ struct per_cpu_pageset *pset;
+ struct list_head *entry;
unsigned long flags;
arch_free_page(page, 0);
@@ -664,13 +657,21 @@ static void fastcall free_hot_cold_page(
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(page);
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pset = zone_pcp(zone, get_cpu());
local_irq_save(flags);
page_state(pgfree)++;
- list_add(&page->lru, &pcp->list);
- pcp->count++;
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ pset->count++;
+ entry = &pset->list;
+ if (cold) {
+ pset->cold_count++;
+ entry = entry->prev; /* tail */
+ }
+ list_add(&page->lru, entry);
+ if (pset->count > pset->high) {
+ pset->count -= free_pages_bulk(zone, pset->batch,
+ &pset->list, 0);
+ pset->cold_count = min(pset->cold_count, pset->count);
+ }
local_irq_restore(flags);
put_cpu();
}
@@ -708,19 +709,31 @@ buffered_rmqueue(struct zone *zone, int
int cpu = get_cpu();
if (order == 0) {
- struct per_cpu_pages *pcp;
+ struct per_cpu_pageset *pset;
+ struct list_head *entry;
- pcp = &zone_pcp(zone, cpu)->pcp[cold];
+ pset = zone_pcp(zone, cpu);
local_irq_save(flags);
- if (!pcp->count) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
- if (unlikely(!pcp->count))
+ if (!pset->count || (cold && !pset->cold_count &&
+ pset->count <= pset->high - (pset->high>>2))) {
+ int count;
+ count = rmqueue_bulk(zone, 0,pset->batch, &pset->list);
+ if (unlikely(!count))
goto failed;
+ pset->count += count;
+ pset->cold_count += count;
}
- page = list_entry(pcp->list.next, struct page, lru);
+
+ pset->count--;
+ entry = pset->list.next;
+ if (cold) {
+ if (pset->cold_count)
+ pset->cold_count--;
+ entry = pset->list.prev;
+ }
+ pset->cold_count = min(pset->cold_count, pset->count);
+ page = list_entry(entry, struct page, lru);
list_del(&page->lru);
- pcp->count--;
} else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
@@ -1318,7 +1331,7 @@ void si_meminfo_node(struct sysinfo *val
void show_free_areas(void)
{
struct page_state ps;
- int cpu, temperature;
+ int cpu;
unsigned long active;
unsigned long inactive;
unsigned long free;
@@ -1335,17 +1348,11 @@ void show_free_areas(void)
printk("\n");
for_each_cpu(cpu) {
- struct per_cpu_pageset *pageset;
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
- pageset = zone_pcp(zone, cpu);
-
- for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: high %d, batch %d used:%d\n",
- cpu,
- temperature ? "cold" : "hot",
- pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch,
- pageset->pcp[temperature].count);
+ printk("cpu %d: high %d, batch %d, pages %d, cold %d\n",
+ cpu, pset->high, pset->batch,
+ pset->count, pset->cold_count);
}
}
@@ -1774,21 +1781,12 @@ static int __devinit zone_batchsize(stru
inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
- struct per_cpu_pages *pcp;
-
memset(p, 0, sizeof(*p));
-
- pcp = &p->pcp[0]; /* hot */
- pcp->count = 0;
- pcp->high = 4 * batch;
- pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
- pcp->high = 2 * batch;
- pcp->batch = max(1UL, batch/2);
- INIT_LIST_HEAD(&pcp->list);
+ p->count = 0;
+ p->cold_count = 0;
+ p->high = 6 * batch;
+ p->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&p->list);
}
#ifdef CONFIG_NUMA
@@ -2168,27 +2166,15 @@ static int zoneinfo_show(struct seq_file
")"
"\n pagesets");
for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
- struct per_cpu_pageset *pageset;
- int j;
+ struct per_cpu_pageset *pset;
- pageset = zone_pcp(zone, i);
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- if (pageset->pcp[j].count)
- break;
- }
- if (j == ARRAY_SIZE(pageset->pcp))
- continue;
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- seq_printf(m,
- "\n cpu: %i pcp: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
- i, j,
- pageset->pcp[j].count,
- pageset->pcp[j].high,
- pageset->pcp[j].batch);
- }
+ pset = zone_pcp(zone, i);
+ seq_printf(m,
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i, pset->count, pset->high, pset->batch);
#ifdef CONFIG_NUMA
seq_printf(m,
"\n numa_hit: %lu"
@@ -2197,12 +2183,12 @@ static int zoneinfo_show(struct seq_file
"\n interleave_hit: %lu"
"\n local_node: %lu"
"\n other_node: %lu",
- pageset->numa_hit,
- pageset->numa_miss,
- pageset->numa_foreign,
- pageset->interleave_hit,
- pageset->local_node,
- pageset->other_node);
+ pset->numa_hit,
+ pset->numa_miss,
+ pset->numa_foreign,
+ pset->interleave_hit,
+ pset->local_node,
+ pset->other_node);
#endif
}
seq_printf(m,
Increasing pageset size gives improvements on kbuild on my Xeon.
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1784,7 +1784,7 @@ inline void setup_pageset(struct per_cpu
memset(p, 0, sizeof(*p));
p->count = 0;
p->cold_count = 0;
- p->high = 6 * batch;
+ p->high = 16 * batch;
p->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&p->list);
}
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -431,8 +431,7 @@ void __free_pages_ok(struct page *page,
*
* -- wli
*/
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area)
{
unsigned long size = 1 << high;
@@ -446,7 +445,6 @@ expand(struct zone *zone, struct page *p
area->nr_free++;
set_page_order(&page[size], high);
}
- return page;
}
/*
@@ -498,7 +496,8 @@ static struct page *__rmqueue(struct zon
rmv_page_order(page);
area->nr_free--;
zone->free_pages -= 1UL << order;
- return expand(zone, page, order, current_order, area);
+ expand(zone, page, order, current_order, area);
+ return page;
}
return NULL;
@@ -513,19 +512,16 @@ static int rmqueue_bulk(struct zone *zon
unsigned long count, struct list_head *list)
{
int i;
- int allocated = 0;
- struct page *page;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
- if (page == NULL)
+ struct page *page = __rmqueue(zone, order);
+ if (unlikely(page == NULL))
break;
- allocated++;
list_add_tail(&page->lru, list);
}
spin_unlock(&zone->lock);
- return allocated;
+ return i;
}
#ifdef CONFIG_NUMA
The previous increase in pcp list size will probably be too much for
huge NUMA machines, despite advances in keeping remote pagesets in check.
Make pcp sizes for remote zones much smaller (slightly smaller than before
the increase), and take advantage of this to increase local pcp list size
again.
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1779,13 +1779,14 @@ static int __devinit zone_batchsize(stru
return batch;
}
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static inline void setup_pageset(struct per_cpu_pageset *p,
+ unsigned long size, unsigned long batch)
{
memset(p, 0, sizeof(*p));
p->count = 0;
p->cold_count = 0;
- p->high = 16 * batch;
- p->batch = max(1UL, 1 * batch);
+ p->high = max(1UL, size);
+ p->batch = max(1UL, batch);
INIT_LIST_HEAD(&p->list);
}
@@ -1819,13 +1820,19 @@ static int __devinit process_zones(int c
struct zone *zone, *dzone;
for_each_zone(zone) {
+ unsigned long size, batch;
zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
if (!zone->pageset[cpu])
goto bad;
- setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ batch = zone_batchsize(zone);
+ if (cpu_to_node(cpu) == zone->zone_pgdat->node_id)
+ size = batch * 32;
+ else
+ size = batch * 4;
+ setup_pageset(zone->pageset[cpu], size, batch);
}
return 0;
@@ -1923,9 +1930,9 @@ static __devinit void zone_pcp_init(stru
#ifdef CONFIG_NUMA
/* Early boot. Slab allocator not functional yet */
zone->pageset[cpu] = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
+ setup_pageset(&boot_pageset[cpu], 0, 0);
#else
- setup_pageset(zone_pcp(zone,cpu), batch);
+ setup_pageset(zone_pcp(zone, cpu), batch * 32, batch);
#endif
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
Use zone_pcp everywhere, even though the NUMA code "knows" the internal
details of the zone. This stops other people from copying the direct
access, and it looks nicer.
Also, only print the pagesets of online cpus in zoneinfo.
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -544,7 +544,7 @@ void drain_remote_pages(void)
local_irq_save(flags);
if (zone->zone_pgdat->node_id != numa_node_id()) {
- pset = zone->pageset[smp_processor_id()];
+ pset = zone_pcp(zone, smp_processor_id());
if (pset->count)
pset->count -= free_pages_bulk(zone,
pset->count, &pset->list, 0);
@@ -1822,9 +1822,9 @@ static int __devinit process_zones(int c
for_each_zone(zone) {
unsigned long size, batch;
- zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
- if (!zone->pageset[cpu])
+ if (!zone_pcp(zone, cpu))
goto bad;
batch = zone_batchsize(zone);
@@ -1832,7 +1832,7 @@ static int __devinit process_zones(int c
size = batch * 32;
else
size = batch * 4;
- setup_pageset(zone->pageset[cpu], size, batch);
+ setup_pageset(zone_pcp(zone, cpu), size, batch);
}
return 0;
@@ -1840,8 +1840,8 @@ bad:
for_each_zone(dzone) {
if (dzone == zone)
break;
- kfree(dzone->pageset[cpu]);
- dzone->pageset[cpu] = NULL;
+ kfree(zone_pcp(dzone, cpu));
+ zone_pcp(dzone, cpu) = NULL;
}
return -ENOMEM;
}
@@ -1929,8 +1929,8 @@ static __devinit void zone_pcp_init(stru
for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
/* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
setup_pageset(&boot_pageset[cpu], 0, 0);
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
#else
setup_pageset(zone_pcp(zone, cpu), batch * 32, batch);
#endif
@@ -2172,7 +2172,7 @@ static int zoneinfo_show(struct seq_file
seq_printf(m,
")"
"\n pagesets");
- for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ for_each_online_cpu(i) {
struct per_cpu_pageset *pset;
pset = zone_pcp(zone, i);
On Sun, 2005-11-06 at 19:20 +1100, Nick Piggin wrote:
> 2/14
>
> plain text document attachment (mm-pte-prefetch.patch)
> Prefetch ptes a line ahead. Worth 25% on ia64 when doing big forks.
>
> Index: linux-2.6/include/asm-generic/pgtable.h
> ===================================================================
> --- linux-2.6.orig/include/asm-generic/pgtable.h
> +++ linux-2.6/include/asm-generic/pgtable.h
> @@ -196,6 +196,33 @@ static inline void ptep_set_wrprotect(st
> })
> #endif
>
> +#ifndef __HAVE_ARCH_PTE_PREFETCH
> +#define PTES_PER_LINE (L1_CACHE_BYTES / sizeof(pte_t))
> +#define PTE_LINE_MASK (~(PTES_PER_LINE - 1))
> +#define ADDR_PER_LINE (PTES_PER_LINE << PAGE_SHIFT)
> +#define ADDR_LINE_MASK (~(ADDR_PER_LINE - 1))
> +
> +#define pte_prefetch(pte, addr, end) \
> +({ \
> + unsigned long __nextline = ((addr) + ADDR_PER_LINE) & ADDR_LINE_MASK; \
> + if (__nextline < (end)) \
> + prefetch(pte + PTES_PER_LINE); \
> +})
> +
Are you sure this is right? At least on PCs, a branch predictor miss is
very expensive and might well be more expensive than the gain you get
from a prefetch.
Arjan van de Ven wrote:
> On Sun, 2005-11-06 at 19:20 +1100, Nick Piggin wrote:
>
>>2/14
>>
>>plain text document attachment (mm-pte-prefetch.patch)
>>Prefetch ptes a line ahead. Worth 25% on ia64 when doing big forks.
>>
>>Index: linux-2.6/include/asm-generic/pgtable.h
>>===================================================================
>>--- linux-2.6.orig/include/asm-generic/pgtable.h
>>+++ linux-2.6/include/asm-generic/pgtable.h
>>@@ -196,6 +196,33 @@ static inline void ptep_set_wrprotect(st
>> })
>> #endif
>>
>>+#ifndef __HAVE_ARCH_PTE_PREFETCH
>>+#define PTES_PER_LINE (L1_CACHE_BYTES / sizeof(pte_t))
>>+#define PTE_LINE_MASK (~(PTES_PER_LINE - 1))
>>+#define ADDR_PER_LINE (PTES_PER_LINE << PAGE_SHIFT)
>>+#define ADDR_LINE_MASK (~(ADDR_PER_LINE - 1))
>>+
>>+#define pte_prefetch(pte, addr, end) \
>>+({ \
>>+ unsigned long __nextline = ((addr) + ADDR_PER_LINE) & ADDR_LINE_MASK; \
>>+ if (__nextline < (end)) \
>>+ prefetch(pte + PTES_PER_LINE); \
>>+})
>>+
>
>
> are you sure this is right? at least on pc's having a branch predictor
> miss is very expensive and might well be more expensive than the gain
> you get from a prefetch
>
Yeah, not 100% sure about this one, which is why it has been sitting
around for so long.
It gives about 25% on a contrived fork workload on an ia64 system, which
is probably close to its best-case workload and architecture. I haven't
found any notable regressions, but it definitely isn't going to be any
faster when the page tables are already in cache.
Until I find a real-world workload that is improved by the patch, I won't
be trying to get it merged.
--
SUSE Labs, Novell Inc.
Nick Piggin wrote: [Sun Nov 06 2005, 03:24:40AM EST]
> 7/14
>
> --
> SUSE Labs, Novell Inc.
>
> bad_range is supposed to be a temporary check. It would be a pity to throw
> it out. Make it depend on CONFIG_DEBUG_VM instead.
>
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c
> +++ linux-2.6/mm/page_alloc.c
> @@ -78,6 +78,7 @@ int min_free_kbytes = 1024;
> unsigned long __initdata nr_kernel_pages;
> unsigned long __initdata nr_all_pages;
>
> +#ifdef CONFIG_DEBUG_VM
> static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
> {
> int ret = 0;
> @@ -119,6 +120,13 @@ static int bad_range(struct zone *zone,
> return 0;
> }
>
> +#else
> +static inline int bad_range(struct zone *zone, struct page *page)
> +{
> + return 0;
> +}
> +#endif
> +
> static void bad_page(const char *function, struct page *page)
> {
> printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
> Index: linux-2.6/lib/Kconfig.debug
> ===================================================================
> --- linux-2.6.orig/lib/Kconfig.debug
> +++ linux-2.6/lib/Kconfig.debug
> @@ -172,7 +172,8 @@ config DEBUG_VM
> bool "Debug VM"
> depends on DEBUG_KERNEL
> help
> - Enable this to debug the virtual-memory system.
> + Enable this to turn on extended checks in the virtual-memory system
> + that may impact performance.
>
> If unsure, say N.
>
Nick,
I don't think you can do it this way. On ia64 VIRTUAL_MEM_MAP depends on
CONFIG_HOLES_IN_ZONE and the check within bad_range for pfn_valid. Holes in
memory (MMIO and etc.) won't have a page structure.
bob
Nick Piggin <[email protected]> writes:
> 1/14
>
> --
> SUSE Labs, Novell Inc.
>
> Slightly optimise some page allocation and freeing functions by
> taking advantage of knowing whether or not interrupts are disabled.
Another thing that could optimize that would be to use local_t
for the per-zone statistics and the VM statistics (I have an
old patch for the latter; it needs polishing up for the current
kernel).
With an architecture optimized for it (like i386/x86-64) they
generate much better code.
-Andi
Bob Picco wrote:
> Nick Piggin wrote: [Sun Nov 06 2005, 03:24:40AM EST]
>
>>7/14
>>
>>--
>>SUSE Labs, Novell Inc.
>>
>
>
>>bad_range is supposed to be a temporary check. It would be a pity to throw
>>it out. Make it depend on CONFIG_DEBUG_VM instead.
>>
>>Index: linux-2.6/mm/page_alloc.c
>>===================================================================
>>--- linux-2.6.orig/mm/page_alloc.c
>>+++ linux-2.6/mm/page_alloc.c
>>@@ -78,6 +78,7 @@ int min_free_kbytes = 1024;
>> unsigned long __initdata nr_kernel_pages;
>> unsigned long __initdata nr_all_pages;
>>
>>+#ifdef CONFIG_DEBUG_VM
>> static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
>> {
>> int ret = 0;
>>@@ -119,6 +120,13 @@ static int bad_range(struct zone *zone,
>> return 0;
>> }
>>
>>+#else
>>+static inline int bad_range(struct zone *zone, struct page *page)
>>+{
>>+ return 0;
>>+}
>>+#endif
>>+
>> static void bad_page(const char *function, struct page *page)
>> {
>> printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
>>Index: linux-2.6/lib/Kconfig.debug
>>===================================================================
>>--- linux-2.6.orig/lib/Kconfig.debug
>>+++ linux-2.6/lib/Kconfig.debug
>>@@ -172,7 +172,8 @@ config DEBUG_VM
>> bool "Debug VM"
>> depends on DEBUG_KERNEL
>> help
>>- Enable this to debug the virtual-memory system.
>>+ Enable this to turn on extended checks in the virtual-memory system
>>+ that may impact performance.
>>
>> If unsure, say N.
>>
>
> Nick,
>
> I don't think you can do it this way. On ia64 VIRTUAL_MEM_MAP depends on
> CONFIG_HOLES_IN_ZONE and the check within bad_range for pfn_valid. Holes in
> memory (MMIO and etc.) won't have a page structure.
>
Hmm, right - in __free_pages_bulk.
Could we make a different call here, or is the full array of bad_range
checks required?
Thanks,
Nick
--
SUSE Labs, Novell Inc.
Andi Kleen wrote:
> Nick Piggin <[email protected]> writes:
>
>
>>1/14
>>
>>--
>>SUSE Labs, Novell Inc.
>>
>>Slightly optimise some page allocation and freeing functions by
>>taking advantage of knowing whether or not interrupts are disabled.
>
>
> Another thing that could optimize that would be to use local_t
> for the per zone statistics and the VM statistics (i have an
> old patch for the later, needs polishing up for the current
> kernel)
> With an architecture optimized for it (like i386/x86-64) they
> generate much better code.
>
Yes, all this turning on and off of interrupts does have a
significant cost here.
With the full patchset applied, most of the hot-path statistics
end up under sections that already require interrupts to be off;
however, there are still a few I didn't get around to converting.
zone_statistics under CONFIG_NUMA, for example.
I wonder if local_t is still good on architectures like ppc64
where it still requires an ll/sc sequence?
Nick
--
SUSE Labs, Novell Inc.
Could you _please_ send your patches inline? Skipping to an attachment
every time to read the description and patch is very awkward. We can
allow that as an exception for people who send a single patch occasionally,
but for a huge patch series it's highly annoying.
On Sun, Nov 06, 2005 at 07:23:30PM +1100, Nick Piggin wrote:
> 5/14
>
> --
> SUSE Labs, Novell Inc.
>
> Inline set_page_refs. Remove mm/internal.h
So why don't you keep the inline function in mm/internal.h? This isn't
really stuff we want driver writers to use, ever.
Christoph Hellwig wrote:
> On Sun, Nov 06, 2005 at 07:23:30PM +1100, Nick Piggin wrote:
>
>>5/14
>>
>>--
>>SUSE Labs, Novell Inc.
>>
>
>
>>Inline set_page_refs. Remove mm/internal.h
>
>
> So why don't you keep the inline function in mm/internal.h? this isn't
> really stuff we want driver writers to use every.
>
>
There are plenty of things in the linux/ headers which driver
writers shouldn't use.
That said, I think your idea is a good one, and one has to start
somewhere. I'll make that change, thanks.
--
SUSE Labs, Novell Inc.
Christoph Hellwig wrote:
> could you _please_ send you patches inline? skipping to an attachment
> everytime to read the description and patch is very awkward. We can
> allow that as an exception for people who send a single patch occasionally,
> but for huge patch series it's highly annoying.
>
Yeah, my mailer traditionally breaks them and not many people have
complained. I was hoping people were just allowing an exception for
me because I'm cool, but I guess not :(
Maybe time to switch mailers... I'll see what I can do.
--
SUSE Labs, Novell Inc.
Nick Piggin wrote: [Sun Nov 06 2005, 07:58:26PM EST]
> Bob Picco wrote:
> >Nick Piggin wrote: [Sun Nov 06 2005, 03:24:40AM EST]
> >
> >>7/14
> >>
> >>--
> >>SUSE Labs, Novell Inc.
> >>
> >
> >
> >>bad_range is supposed to be a temporary check. It would be a pity to throw
> >>it out. Make it depend on CONFIG_DEBUG_VM instead.
> >>
> >>Index: linux-2.6/mm/page_alloc.c
> >>===================================================================
> >>--- linux-2.6.orig/mm/page_alloc.c
> >>+++ linux-2.6/mm/page_alloc.c
> >>@@ -78,6 +78,7 @@ int min_free_kbytes = 1024;
> >>unsigned long __initdata nr_kernel_pages;
> >>unsigned long __initdata nr_all_pages;
> >>
> >>+#ifdef CONFIG_DEBUG_VM
> >>static int page_outside_zone_boundaries(struct zone *zone, struct page
> >>*page)
> >>{
> >> int ret = 0;
> >>@@ -119,6 +120,13 @@ static int bad_range(struct zone *zone,
> >> return 0;
> >>}
> >>
> >>+#else
> >>+static inline int bad_range(struct zone *zone, struct page *page)
> >>+{
> >>+ return 0;
> >>+}
> >>+#endif
> >>+
> >>static void bad_page(const char *function, struct page *page)
> >>{
> >> printk(KERN_EMERG "Bad page state at %s (in process '%s', page
> >> %p)\n",
> >>Index: linux-2.6/lib/Kconfig.debug
> >>===================================================================
> >>--- linux-2.6.orig/lib/Kconfig.debug
> >>+++ linux-2.6/lib/Kconfig.debug
> >>@@ -172,7 +172,8 @@ config DEBUG_VM
> >> bool "Debug VM"
> >> depends on DEBUG_KERNEL
> >> help
> >>- Enable this to debug the virtual-memory system.
> >>+ Enable this to turn on extended checks in the virtual-memory system
> >>+ that may impact performance.
> >>
> >> If unsure, say N.
> >>
> >
> >Nick,
> >
> >I don't think you can do it this way. On ia64 VIRTUAL_MEM_MAP depends on
> >CONFIG_HOLES_IN_ZONE and the check within bad_range for pfn_valid. Holes in
> >memory (MMIO and etc.) won't have a page structure.
> >
>
> Hmm, right - in __free_pages_bulk.
>
> Could we make a different call here, or is the full array of bad_range
> checks required?
Not the full array, just the pfn_valid call. CONFIG_HOLES_IN_ZONE is
already used in page_alloc.c, so perhaps just a pfn_valid check in
__free_pages_bulk could replace the bad_range call that isn't within a
BUG_ON. It's somewhat of a wart, but it's already there. Otherwise we
might want an arch_holes_in_zone inline, which is only required by ia64
and a noop for other arches.
The only place I didn't look at closely is the BUG_ON in expand. I'll do
that tomorrow.
>
> Thanks,
> Nick
>
> --
you're welcome,
bob
Bob Picco wrote:
> Nick Piggin wrote: [Sun Nov 06 2005, 07:58:26PM EST]
>
>>Hmm, right - in __free_pages_bulk.
>>
>>Could we make a different call here, or is the full array of bad_range
>>checks required?
>
> Not the full array. Just the pfn_valid call. Seems CONFIG_HOLES_IN_ZONE is
> already in page_alloc.c, perhaps just in __free_pages_bulk as a replacement
> for the bad_range call which isn't within a BUG_ON check. It's somewhat of a
> wart but already there. Otherwise we might want arch_holes_in_zone inline
> which is only required by ia64 and noop for other arches.
>
Ideally yes, it would be hidden away in an arch specific header file.
In the meantime I will just replace it with an ifdefed pfn_valid call.
> The only place I didn't look closely is the BUG_ON in expand. I'll do that
> tomorrow.
>
--
SUSE Labs, Novell Inc.
On Monday 07 November 2005 02:06, Nick Piggin wrote:
> Yes, all this turning on and off of interrupts does have a
> significant cost here.
How did you find out?
>
> With the full patchset applied, most of the hot path statistics
> get put under areas that already require interrupts to be off,
> however there are still a few I didn't get around to doing.
> zone_statistics on CONFIG_NUMA, for example.
These should just be local_t
>
> I wonder if local_t is still good on architectures like ppc64
> where it still requires an ll/sc sequence?
The current default fallback local_t doesn't require that. It uses
different fields indexed by !!in_interrupt().
-Andi
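(A rough sketch of the fallback scheme described above, purely to show the
shape of the idea: keep two plain counters and index them by !!in_interrupt(),
so process context and interrupt context never touch the same word and no
atomic or ll/sc sequence is needed. This is not the actual asm-generic
local_t code, and in_interrupt() is stubbed out so it compiles in userspace.)

#include <stdio.h>

static int in_interrupt(void)		/* stub for illustration only */
{
	return 0;
}

typedef struct {
	unsigned long v[2];
} local_counter_t;

static void local_counter_inc(local_counter_t *l)
{
	l->v[!!in_interrupt()]++;	/* each context has its own field */
}

static unsigned long local_counter_read(const local_counter_t *l)
{
	return l->v[0] + l->v[1];	/* the reader sums both halves */
}

int main(void)
{
	local_counter_t c = { { 0, 0 } };

	local_counter_inc(&c);
	local_counter_inc(&c);
	printf("%lu\n", local_counter_read(&c));
	return 0;
}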
Andi Kleen wrote:
> On Monday 07 November 2005 02:06, Nick Piggin wrote:
>
>
>>Yes, all this turning on and off of interrupts does have a
>>significant cost here.
>
>
> How did you find out?
>
By measuring the actual performance improvement on kbuild.
Not to mention that profile hits for things like mod_page_state
drop dramatically, but you can't use that alone to be sure of an
improvement.
>
>>With the full patchset applied, most of the hot path statistics
>>get put under areas that already require interrupts to be off,
>>however there are still a few I didn't get around to doing.
>>zone_statistics on CONFIG_NUMA, for example.
>
>
> These should just be local_t
>
Yep.
>
>>I wonder if local_t is still good on architectures like ppc64
>>where it still requires an ll/sc sequence?
>
>
> The current default fallback local_t doesn't require that. It uses
> different fields indexed by !!in_interrupt()
>
Right, I didn't see that. ppc(32), then.
I think maybe for struct page_state there is not so much point
in using local_t because the hot page allocator paths can easily
be covered under the interrupt critical sections.
The other fields aren't very hot, and using local_t would bloat
this up by many cachelines on 64-bit architectures like ppc64,
and would probably make them noticeably more expensive on 32-bit
architectures like ppc.
Actually, the NUMA fields in the pcp lists can probably also
just be put under the interrupt-off section that the page
allocator uses. At least it should be much easier to do when
Seth's __alloc_pages cleanup goes in. I'll keep it in mind.
--
SUSE Labs, Novell Inc.
> Maybe time to switch mailers... I'll see what I can do.
I recommend using a dedicated tool (patchbomb script) to send patches,
not ones email client. It lets you prepare everything ahead of time in
your favorite editor, and obtains optimum results.
See the script I use, at:
http://www.speakeasy.org/~pj99/sgi/sendpatchset
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401
Paul Jackson wrote:
>>Maybe time to switch mailers... I'll see what I can do.
>
>
> I recommend using a dedicated tool (patchbomb script) to send patches,
> not ones email client. It lets you prepare everything ahead of time in
> your favorite editor, and obtains optimum results.
>
> See the script I use, at:
>
> http://www.speakeasy.org/~pj99/sgi/sendpatchset
>
Probably the best idea. I hadn't worried about those until now,
although I have several fairly large patchsets floating around.
Thanks,
Nick
--
SUSE Labs, Novell Inc.
Nick Piggin <[email protected]> writes:
> 9/14
>
> --
> SUSE Labs, Novell Inc.
>
> Optimise page_state manipulations by introducing a direct accessor
> to page_state fields without disabling interrupts, in which case
> the callers must provide their own locking (either disable interrupts
> or not update from interrupt context).
I have a patchkit (which I need to update for the current kernel)
that replaces this with local_t. It gives much better code, is much
simpler, and doesn't require turning off interrupts anywhere.
-Andi