2005-01-13 16:08:24

by Mel Gorman

Subject: [PATCH] Avoiding fragmentation through different allocator V2

The patch is against 2.6.11-rc1 and I'm willing to stand by its
stability. I'm also confident it does its job pretty well, so I'd like it
to be considered for inclusion.

For me, the next stage is to write a linear scanner that goes through the
address space to free up a high-order block of pages on demand. This will
be a tricky job so it'll take me quite a while.



Changelog since V1

o Update patch to 2.6.11-rc1
o Cleaned up bug where memory was wasted on a large bitmap
o Extended fallback_count bean counters to show the fallback count for each
allocation type
o Better commenting

This patch divides allocations into three different types (a short sketch of how
callers request each type follows the list):

UserReclaimable - These are userspace pages that are easily reclaimable. Right
now, I'm putting all allocations of GFP_USER and GFP_HIGHUSER as
well as disk-buffer pages into this category. These pages are trivially
reclaimed by writing the page out to swap or syncing with backing
storage

KernelReclaimable - These are pages allocated by the kernel that are easily
reclaimed. This is stuff like inode caches, dcache, buffer_heads etc.
These types of pages could potentially be reclaimed by dumping the
caches and reaping the slabs (drastic, but you get the idea).

KernelNonReclaimable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
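
As a quick illustration of the list above, here is a minimal sketch (not part of
the patch) of how a caller ends up in each category with the GFP flags this patch
introduces; dentry_cache is borrowed from the dcache.c hunk further down:

/* Illustrative only: how each allocation type is requested */
static void alloc_type_examples(void)
{
	struct page *upage;
	struct dentry *dentry;
	void *buf;

	/* UserReclaimable: GFP_USER/GFP_HIGHUSER now carry __GFP_USERRCLM */
	upage = alloc_page(GFP_HIGHUSER);

	/* KernelReclaimable: shrinkable caches pass __GFP_KERNRCLM explicitly */
	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL | __GFP_KERNRCLM);

	/* KernelNonReclaimable: anything without either flag, e.g. plain GFP_KERNEL */
	buf = kmalloc(128, GFP_KERNEL);
}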

Instead of having one global MAX_ORDER-sized array of free lists, there are
three, one for each type of allocation. Finally, there is a list of pages of
size 2^MAX_ORDER which is a global pool of the largest pages the kernel deals
with.

Once a 2^MAX_ORDER block of pages is split for a type of allocation, it is
added to the free-lists for that type, in effect reserving it. Hence, over
time, pages of the different types can be clustered together. This means that
if we wanted 2^MAX_ORDER number of pages, we could linearly scan a block of
pages allocated for UserReclaimable and page each of them out.

Fallback is used when there are no 2^MAX_ORDER pages available and there
are no free pages of the desired type. The fallback lists were chosen in a
way that keeps the most easily reclaimable pages together.
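
Condensed into one place, the allocation path this patch implements looks roughly
like the sketch below. It is distilled from the __rmqueue() changes further down,
not copied from them; take_from_type_lists() and steal_from_global_pool() are
invented names standing in for the list manipulation the real patch does inline:

/* Sketch of the per-type allocation path (locking and accounting omitted) */
static struct page *sketch_rmqueue(struct zone *zone, unsigned int order,
				   int gfp_flags)
{
	/* 1. Pick the preferred type from the gfp flags */
	int type = (gfp_flags & __GFP_USERRCLM) ? ALLOC_USERRCLM :
		   (gfp_flags & __GFP_KERNRCLM) ? ALLOC_KERNRCLM :
						  ALLOC_KERNNORCLM;
	int *fallback = fallback_allocs[type];
	int tried_global = 0;
	int attempt = 0;

	while (attempt < ALLOC_TYPES) {
		/* 2. Try the free lists reserved for this type */
		struct page *page =
			take_from_type_lists(zone, fallback[attempt], order);
		if (page)
			return page;

		/* 3. Once only: split a 2^MAX_ORDER block off the global pool,
		 *    reserve it for the preferred type and retry */
		if (!tried_global && steal_from_global_pool(zone, type)) {
			tried_global = 1;
			continue;
		}

		/* 4. Otherwise fall back to the next type in fallback_allocs[] */
		attempt++;
	}
	return NULL;
}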

Signed-off-by: Mel Gorman <[email protected]>

diff -rup linux-2.6.11-rc1-clean/fs/buffer.c linux-2.6.11-rc1-mbuddy/fs/buffer.c
--- linux-2.6.11-rc1-clean/fs/buffer.c 2005-01-12 04:01:23.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/buffer.c 2005-01-13 10:56:30.000000000 +0000
@@ -1134,7 +1134,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;

- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USERRCLM);
if (!page)
return NULL;

@@ -2997,7 +2998,8 @@ static void recalc_bh_state(void)

struct buffer_head *alloc_buffer_head(int gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
diff -rup linux-2.6.11-rc1-clean/fs/dcache.c linux-2.6.11-rc1-mbuddy/fs/dcache.c
--- linux-2.6.11-rc1-clean/fs/dcache.c 2005-01-12 04:00:09.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/dcache.c 2005-01-13 10:56:30.000000000 +0000
@@ -715,7 +715,8 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;

- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache,
+ GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;

diff -rup linux-2.6.11-rc1-clean/fs/ext2/super.c linux-2.6.11-rc1-mbuddy/fs/ext2/super.c
--- linux-2.6.11-rc1-clean/fs/ext2/super.c 2005-01-12 04:01:24.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext2/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -137,7 +137,7 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup linux-2.6.11-rc1-clean/fs/ext3/super.c linux-2.6.11-rc1-mbuddy/fs/ext3/super.c
--- linux-2.6.11-rc1-clean/fs/ext3/super.c 2005-01-12 04:02:11.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext3/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -434,7 +434,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;

- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup linux-2.6.11-rc1-clean/fs/ntfs/inode.c linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c
--- linux-2.6.11-rc1-clean/fs/ntfs/inode.c 2005-01-12 04:01:45.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c 2005-01-13 10:56:30.000000000 +0000
@@ -318,7 +318,7 @@ struct inode *ntfs_alloc_big_inode(struc

ntfs_debug("Entering.");
ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
- SLAB_NOFS);
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -343,7 +343,8 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache,
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
diff -rup linux-2.6.11-rc1-clean/include/linux/gfp.h linux-2.6.11-rc1-mbuddy/include/linux/gfp.h
--- linux-2.6.11-rc1-clean/include/linux/gfp.h 2005-01-12 04:00:35.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/gfp.h 2005-01-13 14:18:06.000000000 +0000
@@ -38,21 +38,24 @@ struct vm_area_struct;
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
#define __GFP_COMP 0x4000 /* Add compound page metadata */
#define __GFP_ZERO 0x8000 /* Return zeroed page on success */
+#define __GFP_KERNRCLM 0x10000 /* Kernel page that is easily reclaimable */
+#define __GFP_USERRCLM 0x20000 /* User is a userspace user */

-#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 17 /* Room for 17 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)

/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
- __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
+ __GFP_USERRCLM|__GFP_KERNRCLM)

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USERRCLM)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_USERRCLM)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
diff -rup linux-2.6.11-rc1-clean/include/linux/mmzone.h linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h
--- linux-2.6.11-rc1-clean/include/linux/mmzone.h 2005-01-12 04:01:17.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h 2005-01-13 14:24:27.000000000 +0000
@@ -19,6 +19,10 @@
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
+#define ALLOC_TYPES 3
+#define ALLOC_KERNNORCLM 0
+#define ALLOC_KERNRCLM 1
+#define ALLOC_USERRCLM 2

struct free_area {
struct list_head free_list;
@@ -131,8 +135,37 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];

+ /*
+ * There are ALLOC_TYPE number of MAX_ORDER free lists. Once a
+ * MAX_ORDER block of pages has been split for an allocation type,
+ * the whole block is reserved for that type of allocation. The
+ * types are User Reclaimable, Kernel Reclaimable and Kernel
+ * Non-reclaimable. The objective is to reduce fragmentation
+ * overall
+ */
+ struct free_area free_area_lists[ALLOC_TYPES][MAX_ORDER];
+
+ /*
+ * This is a list of page blocks of 2^MAX_ORDER. Once one of
+ * these are split, the buddy is added to the appropriate
+ * free_area_lists. When the buddies are later merged, they
+ * are placed back here
+ */
+ struct free_area free_area_global;
+
+ /*
+ * This map tracks what each 2^MAX_ORDER sized block has been used for.
+ * Each 2^MAX_ORDER block of pages has 2 bits in this map to remember
+ * what the block is for. When a page is freed, its index within this
+ * bitmap is calculated using (address >> MAX_ORDER) * 2 . This means
+ * that pages will always be freed into the correct list in
+ * free_area_lists
+ *
+ * The bits are set when a 2^MAX_ORDER block of pages is split
+ */
+
+ unsigned long *free_area_usemap;

ZONE_PADDING(_pad1_)

diff -rup linux-2.6.11-rc1-clean/mm/page_alloc.c linux-2.6.11-rc1-mbuddy/mm/page_alloc.c
--- linux-2.6.11-rc1-clean/mm/page_alloc.c 2005-01-12 04:00:02.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/mm/page_alloc.c 2005-01-13 14:58:06.000000000 +0000
@@ -46,9 +46,31 @@ unsigned long totalhigh_pages;
long nr_swap_pages;
int sysctl_lower_zone_protection = 0;

+/* Bean counters for the per-type buddy allocator */
+int fallback_count[ALLOC_TYPES] = { 0, 0, 0};
+int drastic_fallback_count=0;
+int global_steal=0;
+int global_refill=0;
+int kernnorclm_count=0;
+int kernrclm_count=0;
+int userrclm_count=0;
+
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

+/**
+ * The allocator tries to put allocations of the same type in the
+ * same 2^MAX_ORDER blocks of pages. When memory is low, this may
+ * not be possible so this describes what order they should fall
+ * back on
+ */
+int fallback_allocs[ALLOC_TYPES][ALLOC_TYPES] = {
+ { ALLOC_KERNNORCLM, ALLOC_KERNRCLM, ALLOC_USERRCLM },
+ { ALLOC_KERNRCLM, ALLOC_KERNNORCLM, ALLOC_USERRCLM },
+ { ALLOC_USERRCLM, ALLOC_KERNNORCLM, ALLOC_KERNRCLM }
+};
+
+
/*
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
@@ -57,6 +79,7 @@ struct zone *zone_table[1 << (ZONES_SHIF
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *type_names[ALLOC_TYPES] = { "KernNoRclm", "KernRclm", "UserRclm"};
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
@@ -103,6 +126,48 @@ static void bad_page(const char *functio
tainted |= TAINT_BAD_PAGE;
}

+/*
+ * Return the type of use the 2^MAX_ORDER block of pages containing
+ * the given page is reserved for
+ */
+static int get_pageblock_type(struct page *page) {
+ struct zone *zone = page_zone(page);
+ int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+ int bitidx = pageidx * 2;
+
+ /* Bit 1 will be set if the block is kernel reclaimable */
+ if (test_bit(bitidx,zone->free_area_usemap)) return ALLOC_KERNRCLM;
+
+ /* Bit 2 will be set if the block is user reclaimable */
+ if (test_bit(bitidx+1, zone->free_area_usemap)) return ALLOC_USERRCLM;
+
+ return ALLOC_KERNNORCLM;
+}
+
+static void set_pageblock_type(struct page *page, int type) {
+ int bit1, bit2;
+ struct zone *zone = page_zone(page);
+ int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+ int bitidx = pageidx * 2;
+ bit1 = bit2 = 0;
+
+ if (type == ALLOC_KERNRCLM) {
+ set_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ if (type == ALLOC_USERRCLM) {
+ clear_bit(bitidx, zone->free_area_usemap);
+ set_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ clear_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+
+}
+
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
@@ -231,6 +296,7 @@ static inline void __free_pages_bulk (st
unsigned long page_idx;
struct page *coalesced;
int order_size = 1 << order;
+ struct free_area *area;

if (unlikely(order))
destroy_compound_page(page, order);
@@ -240,9 +306,12 @@ static inline void __free_pages_bulk (st
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));

+ /* Select the area to use for freeing based on the type */
+ struct free_area *freelist =
+ zone->free_area_lists[get_pageblock_type(page)];
+
zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
- struct free_area *area;
struct page *buddy;
int buddy_idx;

@@ -254,16 +323,29 @@ static inline void __free_pages_bulk (st
break;
/* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = freelist + order;
area->nr_free--;
rmv_page_order(buddy);
page_idx &= buddy_idx;
order++;
}
+
+ /*
+ * If a MAX_ORDER block of pages is being freed, it is
+ * no longer reserved for a particular type of allocation
+ * so put it in the global list
+ */
+ if (order >= MAX_ORDER-1) {
+ area = &(zone->free_area_global);
+ global_refill++;
+ } else {
+ area = freelist + order;
+ }
+
coalesced = base + page_idx;
set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ list_add(&coalesced->lru, &area->free_list);
+ area->nr_free++;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -310,6 +392,7 @@ free_pages_bulk(struct zone *zone, int c
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
page = list_entry(list->prev, struct page, lru);
+
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, base, zone, order);
@@ -420,14 +503,36 @@ static void prep_new_page(struct page *p
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int flags)
{
struct free_area * area;
unsigned int current_order;
struct page *page;
+ int global_split=0;
+
+ /* Select area to use based on gfp_flags */
+ int alloctype;
+ int retry_count=0;
+ if (flags & __GFP_USERRCLM) {
+ alloctype = ALLOC_USERRCLM;
+ userrclm_count++;
+ }
+ else if (flags & __GFP_KERNRCLM) {
+ alloctype = ALLOC_KERNRCLM;
+ kernrclm_count++;
+ } else {
+ alloctype = ALLOC_KERNNORCLM;
+ kernnorclm_count++;
+ }

+ /* Ok, pick the fallback order based on the type */
+ int *fallback_list = fallback_allocs[alloctype];
+
+retry:
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ alloctype = fallback_list[retry_count];
+ area = zone->free_area_lists[alloctype] + current_order;
+
if (list_empty(&area->free_list))
continue;

@@ -439,6 +544,34 @@ static struct page *__rmqueue(struct zon
return expand(zone, page, order, current_order, area);
}

+ /* Take from the global pool if this is the first attempt */
+ if (!global_split && !list_empty(&(zone->free_area_global.free_list))){
+ /*
+ * Remove a MAX_ORDER block from the global pool and add
+ * it to the list of desired alloc_type
+ */
+ page = list_entry(zone->free_area_global.free_list.next,
+ struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru,
+ &(zone->free_area_lists[alloctype][MAX_ORDER-1].free_list));
+ global_steal++;
+ global_split=1;
+
+ /* Mark this block of pages as for use with this alloc type */
+ set_pageblock_type(page, alloctype);
+
+ goto retry;
+ }
+
+ /*
+ * Here, the alloc type lists has been depleted as well as the global
+ * pool, so fallback
+ */
+ retry_count++;
+ fallback_count[alloctype]++;
+ if (retry_count != ALLOC_TYPES) goto retry;
+
return NULL;
}

@@ -448,7 +581,8 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list,
+ int gfp_flags)
{
unsigned long flags;
int i;
@@ -457,7 +591,7 @@ static int rmqueue_bulk(struct zone *zon

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
if (page == NULL)
break;
allocated++;
@@ -493,7 +627,7 @@ static void __drain_pages(unsigned int c
void mark_free_pages(struct zone *zone)
{
unsigned long zone_pfn, flags;
- int order;
+ int order, type;
struct list_head *curr;

if (!zone->spanned_pages)
@@ -503,14 +637,17 @@ void mark_free_pages(struct zone *zone)
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

- for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
- unsigned long start_pfn, i;
+ for (type=0; type < ALLOC_TYPES; type++) {

- start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ for (order = MAX_ORDER - 1; order >= 0; --order)
+ list_for_each(curr, &zone->free_area_lists[type][order].free_list) {
+ unsigned long start_pfn, i;
+
+ start_pfn = page_to_pfn(list_entry(curr, struct page, lru));

- for (i=0; i < (1<<order); i++)
- SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ for (i=0; i < (1<<order); i++)
+ SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -612,14 +749,15 @@ buffered_rmqueue(struct zone *zone, int
struct page *page = NULL;
int cold = !!(gfp_flags & __GFP_COLD);

- if (order == 0) {
+ if (order == 0 && (gfp_flags & __GFP_USERRCLM)) {
struct per_cpu_pages *pcp;

pcp = &zone->pageset[get_cpu()].pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list,
+ gfp_flags);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -631,7 +769,7 @@ buffered_rmqueue(struct zone *zone, int

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
spin_unlock_irqrestore(&zone->lock, flags);
}

@@ -658,7 +796,7 @@ int zone_watermark_ok(struct zone *z, in
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
- int o;
+ int o, type;

if (gfp_high)
min -= min / 2;
@@ -667,15 +805,17 @@ int zone_watermark_ok(struct zone *z, in

if (free_pages <= min + z->protection[alloc_type])
return 0;
- for (o = 0; o < order; o++) {
- /* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ for (o = 0; o < order; o++) {
+ /* At the next order, this order's pages become unavailable */
+ free_pages -= z->free_area_lists[type][o].nr_free << o;

- /* Require fewer higher order pages to be free */
- min >>= 1;
+ /* Require fewer higher order pages to be free */
+ min >>= 1;

- if (free_pages <= min)
- return 0;
+ if (free_pages <= min)
+ return 0;
+ }
}
return 1;
}
@@ -1124,6 +1264,7 @@ void show_free_areas(void)
unsigned long inactive;
unsigned long free;
struct zone *zone;
+ int type;

for_each_zone(zone) {
show_node(zone);
@@ -1216,8 +1357,10 @@ void show_free_areas(void)

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
- total += nr << order;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ nr = zone->free_area_lists[type][order].nr_free;
+ total += nr << order;
+ }
printk("%lu*%lukB ", nr, K(1UL) << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -1515,10 +1658,26 @@ void zone_init_free_lists(struct pglist_
unsigned long size)
{
int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
- }
+ int type;
+ struct free_area *area;
+
+ /* Initialise the three size ordered lists of free_areas */
+ for (type=0; type < ALLOC_TYPES; type++) {
+
+ for (order = 0; ; order++) {
+ area = zone->free_area_lists[type];
+
+ INIT_LIST_HEAD(&area[order].free_list);
+ area[order].nr_free = 0;
+ if (order == MAX_ORDER-1) {
+ break;
+ }
+ }
+ }
+
+ /* Initialise the global pool of 2^size pages */
+ INIT_LIST_HEAD(&zone->free_area_global.free_list);
+ zone->free_area_global.nr_free=0;
}

#ifndef __HAVE_ARCH_MEMMAP_INIT
@@ -1637,6 +1796,11 @@ static void __init free_area_init_core(s
zone_start_pfn += size;

zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+ zone->free_area_usemap =
+ (unsigned long *)alloc_bootmem_node(pgdat,
+ ((size >> MAX_ORDER) * 2 + 8) / 8);
+ memset((unsigned long *)zone->free_area_usemap, ALLOC_KERNNORCLM,
+ ((size >> MAX_ORDER) * 2 + 8) / 8);
}
}

@@ -1714,19 +1878,90 @@ static int frag_show(struct seq_file *m,
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- int order;
+ int order, type;
+ struct list_head *elem;

- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!zone->present_pages)
- continue;

- spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
- spin_unlock_irqrestore(&zone->lock, flags);
- seq_putc(m, '\n');
- }
+ /* Show global fragmentation statistics */
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ unsigned long nr_bufs = 0;
+ for (order = 0; order < MAX_ORDER-1; ++order) {
+ nr_bufs = 0;
+
+ for (type=0; type < ALLOC_TYPES; type++) {
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ }
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+
+ /* Scan global list */
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show statistics for each allocation type */
+ seq_printf(m, "\nPer-allocation-type statistics");
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ unsigned long nr_bufs = 0;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ seq_printf(m, "\nNode %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ type_names[type]);
+ struct list_head *elem;
+ for (order = 0; order < MAX_ORDER; ++order) {
+ nr_bufs = 0;
+
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+ }
+
+ /* Scan global list */
+ seq_printf(m, "\n");
+ seq_printf(m, "Node %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ "MAX_ORDER");
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show bean counters */
+ seq_printf(m, "\nGlobal beancounters\n");
+ seq_printf(m, "Global steals: %d\n", global_steal);
+ seq_printf(m, "Global refills: %d\n", global_refill);
+ seq_printf(m, "KernNoRclm allocs: %d\n", kernnorclm_count);
+ seq_printf(m, "KernRclm allocs: %d\n", kernrclm_count);
+ seq_printf(m, "UserRclm allocs: %d\n", userrclm_count);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[0],
+ fallback_count[0]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[1],
+ fallback_count[1]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[2],
+ fallback_count[2]);
+
+
+
return 0;
}


2005-01-15 00:32:19

by Marcelo Tosatti

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2


Hi Mel,

On Thu, Jan 13, 2005 at 03:56:46PM +0000, Mel Gorman wrote:
> The patch is against 2.6.11-rc1 and I'm willing to stand by it's
> stability. I'm also confident it does it's job pretty well so I'd like it
> to be considered for inclusion.

This is very interesting!

Other than the decreased fragmentation you are aiming for, by providing
clustering of different types of allocations you might see a performance gain (or loss :))
due to changes in cache colouring effects.

It depends on the workload/application mix and type of cache of course, but I think there will be a
significant measurable difference on most common workloads.

Physically indexed caches are going to be more influenced (since virtually indexed caches
will only have an effect on kernel pages with your new scheme).

Have you done any investigation in that respect? IMHO such verification is really
important before attempting to merge it.

BTW, talking about cache colouring, I think this is an area which has HUGE space for improvement.
The allocator is completely unaware of colouring (except the SLAB) - we should try to
come up with a light per-process allocation colouring optimizer. But that's another
story.

> For me, the next stage is to write a linear scanner that goes through the
> address space to free up a high-order block of pages on demand. This will
> be a tricky job so it'll take me quite a while.

We're paving the road to implement a generic "weak" migration function on top
of the current page migration infrastructure. With "weak" I mean that it bails
out easily if the page cannot be migrated, unlike the "strong" version which
_has_ to migrate the page(s) (for memory hotplug purposes).

With such a function in place it's easier to have different implementations of
defragmentation logic - we might want to collaborate on that.

Your bitmap also allows a hint for the "defragmenter" to know the type of pages,
and possibly the size of the block, so it can avoid trying to migrate non-reclaimable
memory early on. It could make the scanning procedure much more lightweight.
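
For what it is worth, the hint could be as simple as the sketch below, which reuses
get_pageblock_type() from the patch; the scan bounds and migrate_block() are
hypothetical stand-ins for the migration infrastructure being discussed:

/* Skip 2^MAX_ORDER blocks the usemap marks as kernel non-reclaimable;
 * only UserRclm/KernRclm blocks are worth offering to the migrator. */
for (page = scan_start; page < scan_end; page += 1 << MAX_ORDER) {
	if (get_pageblock_type(page) == ALLOC_KERNNORCLM)
		continue;
	migrate_block(page);	/* hypothetical "weak" migration entry point */
}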


> Changelog since V1
>
> o Update patch to 2.6.11-rc1
> o Cleaned up bug where memory was wasted on a large bitmap
> o Extended fallback_count bean counters to show the fallback count for each
> allocation type
> o Better commenting
>
> This patch divides allocations into three different types of allocations;
>
> UserReclaimable - These are userspace pages that are easily reclaimable. Right
> now, I'm putting all allocations of GFP_USER and GFP_HIGHUSER as
> well as disk-buffer pages into this category. These pages are trivially
> reclaimed by writing the page out to swap or syncing with backing
> storage
>
> KernelReclaimable - These are pages allocated by the kernel that are easily
> reclaimed. This is stuff like inode caches, dcache, buffer_heads etc.
> These type of pages potentially could be reclaimed by dumping the
> caches and reaping the slabs (drastic, but you get the idea).
>
> KernelNonReclaimable - These are pages that are allocated by the kernel that
> are not trivially reclaimed. For example, the memory allocated for a
> loaded module would be in this category. By default, allocations are
> considered to be of this type
>
> Instead of having one global MAX_ORDER-sized array of free lists, there are
> three, one for each type of allocation. Finally, there is a list of pages of
> size 2^MAX_ORDER which is a global pool of the largest pages the kernel deals
> with.
>
> Once a 2^MAX_ORDER block of pages it split for a type of allocation, it is
> added to the free-lists for that type, in effect reserving it. Hence, over
> time, pages of the different types can be clustered together. This means that
> if we wanted 2^MAX_ORDER number of pages, we could linearly scan a block of
> pages allocated for UserReclaimable and page each of them out.
>
> Fallback is used when there are no 2^MAX_ORDER pages available and there
> are no free pages of the desired type. The fallback lists were chosen in a
> way that keeps the most easily reclaimable pages together.
>
>
> Signed-off-by: Mel Gorman <[email protected]>
>
> @@ -658,7 +796,7 @@ int zone_watermark_ok(struct zone *z, in
> {
> /* free_pages my go negative - that's OK */
> long min = mark, free_pages = z->free_pages - (1 << order) + 1;
> - int o;
> + int o, type;
>
> if (gfp_high)
> min -= min / 2;
> @@ -667,15 +805,17 @@ int zone_watermark_ok(struct zone *z, in
>
> if (free_pages <= min + z->protection[alloc_type])
> return 0;
> - for (o = 0; o < order; o++) {
> - /* At the next order, this order's pages become unavailable */
> - free_pages -= z->free_area[o].nr_free << o;
> + for (type=0; type < ALLOC_TYPES; type++) {
> + for (o = 0; o < order; o++) {
> + /* At the next order, this order's pages become unavailable */
> + free_pages -= z->free_area_lists[type][o].nr_free << o;

You want to do
free_pages -= (z->free_area_lists[0][o].nr_free + z->free_area_lists[2][o].nr_free +
z->free_area_lists[2][o].nr_free) << o;

So not to interfere with the "min" decay (and remove the allocation type loop).

>
> - /* Require fewer higher order pages to be free */
> - min >>= 1;
> + /* Require fewer higher order pages to be free */
> + min >>= 1;
>
> - if (free_pages <= min)
> - return 0;
> + if (free_pages <= min)
> + return 0;
> + }

I'll play with your patch during the weekend, run some benchmarks (STP is our friend),
try to measure the overhead, etc.

Congrats, this is very cool!

2005-01-15 01:12:35

by Marcelo Tosatti

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2

> You want to do
> free_pages -= (z->free_area_lists[0][o].nr_free + z->free_area_lists[2][o].nr_free +
^^^^ = 1
> z->free_area_lists[2][o].nr_free) << o;

I meant the sum of the free lists. You'd better use the defines instead of course :)

> So not to interfere with the "min" decay (and remove the allocation type loop).
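
Spelled out with the defines, the suggested change amounts to something like this
(a sketch of the corrected loop, not a tested hunk):

	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable,
		 * summed across all three allocation types */
		free_pages -= (z->free_area_lists[ALLOC_KERNNORCLM][o].nr_free +
			       z->free_area_lists[ALLOC_KERNRCLM][o].nr_free +
			       z->free_area_lists[ALLOC_USERRCLM][o].nr_free) << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}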

2005-01-15 19:18:58

by Mel Gorman

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2

On Fri, 14 Jan 2005, Marcelo Tosatti wrote:

> On Thu, Jan 13, 2005 at 03:56:46PM +0000, Mel Gorman wrote:
> > The patch is against 2.6.11-rc1 and I'm willing to stand by it's
> > stability. I'm also confident it does it's job pretty well so I'd like it
> > to be considered for inclusion.
>
> This is very interesting!
>

Thanks

> Other than the advantage of decreased fragmentation which you aim, by
> providing clustering of different types of allocations you might have a
> performance gain (or loss :)) due to changes in cache colouring
> effects.
>

That is possible but I haven't thought of a way of measuring the cache
colouring effects (if any). There is also the problem that the additional
complexity of the allocator will offset this benefit. The two main loss
points of the allocator are increased complexity and the increased size of
the zone struct.

> It depends on the workload/application mix and type of cache of course,
> but I think there will be a significant measurable difference on most
> common workloads.
>

If I could only measure it :/

> Have you done any investigation with that respect? IMHO such
> verification is really important before attempting to merge it.
>

No unfortunately. Do you know of a test I can use?

> BTW talking about cache colouring, I this is an area which has a HUGE
> space for improvement. The allocator is completly unaware of colouring
> (except the SLAB) - we should try to come up with a light per-process
> allocation colouring optimizer. But thats another history.
>

This also was tried and dropped. The allocator was a lot more complex and
the implementor was unable to measure it. IIRC, the patch was not accepted
with a comment along the lines of "If you can't measure it, it doesn't
exist". Before I walk down the page coloring path again, I'll need some
scheme that measures the cache-effect.

Totally aside, I'm doing this work because I've started a PhD on
developing solid metrics for measuring VM performance and then devising
new or modified algorithms using the metrics to see if the changes are any
good.

> > For me, the next stage is to write a linear scanner that goes through the
> > address space to free up a high-order block of pages on demand. This will
> > be a tricky job so it'll take me quite a while.
>
> We're paving the road to implement a generic "weak" migration function on top
> of the current page migration infrastructure. With "weak" I mean that it bails
> out easily if the page cannot be migrated, unlike the "strong" version which
> _has_ to migrate the page(s) (for memory hotplug purpose).
>
> With such function in place its easier to have different implementations of defragmentation
> logic - we might want to coolaborate on that.
>

I've also started something like this although I think you'll find my
first approach childishly simple. I implemented a linear scanner that
finds the KernRclm and UserRclm areas. It then makes a list of the PageLRU
pages and sends them to shrink_list(). I ran a test which put the machine
under heavy stress and then tried to allocate 75% of ZONE_NORMAL with
2^MAX_ORDER pages (allocations done via a kernel module). I found that
the standard allocator was only able to successfully allocate 1% of the
allocations (3 blocks), my modified allocator managed 50% (81 blocks) and
with linear scanning in place, it was 76% (122 blocks). I figure I could
get the linear scanning figures even higher if I taught the allocator to
reserve the pages it frees for the process performing the linear scanning.
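
Very roughly, the scanner described above boils down to the sketch below. It is an
illustration under assumptions rather than the code I'm running: block_start,
isolate_lru_to_list() and reclaim_page_list() are placeholders, with the last one
standing in for handing the isolated pages to shrink_list():

/* Try to empty one 2^MAX_ORDER block by reclaiming its LRU pages */
static void scan_one_block(struct page *block_start)
{
	LIST_HEAD(reclaim_list);
	struct page *page;

	/* Only blocks reserved for reclaimable allocations are worth scanning */
	if (get_pageblock_type(block_start) == ALLOC_KERNNORCLM)
		return;

	for (page = block_start; page < block_start + (1 << MAX_ORDER); page++)
		if (PageLRU(page))
			isolate_lru_to_list(&reclaim_list, page);

	reclaim_page_list(&reclaim_list);	/* wraps shrink_list() */
}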

However, I also know the linear scanner trashes the LRU lists and probably
brings all sorts of performance regressions with it just to satisfy the
high-order allocations.

The new patches for the allocator (the last patch I posted has a serious bug
in it), the linear scanner and the results will be posted in another mail.

> Your bitmap also allows a hint for the "defragmentator" to know the type
> of pages, and possibly size of the block, so it can avoid earlier trying
> to migrate non reclaimable memory. It possibly makes the scanning
> procedure much lightweight.
>

Potentially. I need to catch up more on the existing schemes. I've been
out of the VM loop for a long time now so I'm still playing the Catch-Up
game.

> > <SNIP>
>
> You want to do
> free_pages -= (z->free_area_lists[0][o].nr_free + z->free_area_lists[2][o].nr_free +
> z->free_area_lists[2][o].nr_free) << o;
>
> So not to interfere with the "min" decay (and remove the allocation type loop).
>

Agreed. New patch has this in place

> >
> > - /* Require fewer higher order pages to be free */
> > - min >>= 1;
> > + /* Require fewer higher order pages to be free */
> > + min >>= 1;
> >
> > - if (free_pages <= min)
> > - return 0;
> > + if (free_pages <= min)
> > + return 0;
> > + }
>
> I'll play with your patch during the weekend, run some benchmarks (STP
> is our friend), try to measure the overhead, etc.
>

I'll also look at STP again and start trying to move some of my tests to
it (I have a fairly involved testing scheme right now). I recall that STP
was very easy to get set up with; I was just pressed for time.

> Congrats, this is very cool!
>

Thanks a lot :)

--
Mel Gorman

2005-01-15 20:10:38

by Marcelo Tosatti

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2

On Sat, Jan 15, 2005 at 07:18:42PM +0000, Mel Gorman wrote:
> On Fri, 14 Jan 2005, Marcelo Tosatti wrote:
>
> > On Thu, Jan 13, 2005 at 03:56:46PM +0000, Mel Gorman wrote:
> > > The patch is against 2.6.11-rc1 and I'm willing to stand by it's
> > > stability. I'm also confident it does it's job pretty well so I'd like it
> > > to be considered for inclusion.
> >
> > This is very interesting!
> >
>
> Thanks
>
> > Other than the advantage of decreased fragmentation which you aim, by
> > providing clustering of different types of allocations you might have a
> > performance gain (or loss :)) due to changes in cache colouring
> > effects.
> >
>
> That is possible but it I haven't thought of a way of measuring the cache
> colouring effects (if any). There is also the problem that the additional
> complexity of the allocator will offset this benefit. The two main loss
> points of the allocator are increased complexity and the increased size of
> the zone struct.

We should be able to measure that too...

If you look at the performance numbers of applications which do data crunching,
reading/writing data to disk (scientific applications), or even databases,
plus a standard set of IO benchmarks...

Of course you're not able to measure the change in cache hits/misses (which would be nice),
but you can get an idea of how measurable the final performance impact is, including
the page allocator overhead and the increased zone struct size (I don't think the struct zone
size increase makes much difference).

We should be able to use the CPU performance counters to get exact miss/hit numbers,
but it seems it's not yet possible to use Mikael Pettersson's pmc inside the kernel. I asked him
some time ago but never got around to trying anything:

Subject: Re: Measuring kernel-level code cache hits/misses with perfctr

> Hi Mikael,
>
> I've been wondering if its possible to use PMC's
> to monitor L1 and/or L2 cache hits from kernel code?

You can count them by using the global-mode counters interface
(present in the perfctr-2.6 package but not in the 2.6-mm kernel
unfortunately) and restricting the counters to CPL 0.

However, for profiling purposes you probably want to catch overflow
interrupts, and that's not supported for global-mode counters.
I simply haven't had time to implement that feature.


> > It depends on the workload/application mix and type of cache of course,
> > but I think there will be a significant measurable difference on most
> > common workloads.
> >
>
> If I could only measure it :/
>
> > Have you done any investigation with that respect? IMHO such
> > verification is really important before attempting to merge it.
> >
>
> No unfortunately. Do you know of a test I can use?

I think some CPU/memory intensive benchmarks should give us a hint of the total
impact ?

> > BTW talking about cache colouring, I this is an area which has a HUGE
> > space for improvement. The allocator is completly unaware of colouring
> > (except the SLAB) - we should try to come up with a light per-process
> > allocation colouring optimizer. But thats another history.
> >
>
> This also was tried and dropped. The allocator was a lot more complex and
> the implementor was unable to measure it. IIRC, the patch was not accepted
> with a comment along the lines of "If you can't measure it, it doesn't
> exist". Before I walk down the page coloring path again, I'll need some
> scheme that measures the cache-effect.

Someone needs to write the helper functions to use the PMC's and test that.

> Totally aside, I'm doing this work because I've started a PhD on
> developing solid metrics for measuring VM performance and then devising
> new or modified algorithms using the metrics to see if the changes are any
> good.

Nice! Make your work public! I'm personally very interested in this area.

> > > For me, the next stage is to write a linear scanner that goes through the
> > > address space to free up a high-order block of pages on demand. This will
> > > be a tricky job so it'll take me quite a while.
> >
> > We're paving the road to implement a generic "weak" migration function on top
> > of the current page migration infrastructure. With "weak" I mean that it bails
> > out easily if the page cannot be migrated, unlike the "strong" version which
> > _has_ to migrate the page(s) (for memory hotplug purpose).
> >
> > With such function in place its easier to have different implementations of defragmentation
> > logic - we might want to coolaborate on that.
> >
>
> I've also started something like this although I think you'll find my
> first approach childishly simple. I implemented a linear scanner that
> finds the KernRclm and UserRclm areas. It then makes a list of the PageLRU
> pages and sends them to shrink_list(). I ran a test which put the machine
> under heavy stress and then tried to allocate 75% of ZONE_NORMAL with
> 2^_MAX_ORDER pages (allocations done via a kernel module). I found that
> the standard allocator was only able to successfully allocate 1% of the
> allocations (3 blocks), my modified allocator managed 50% (81 blocks) and
> with linear scanning in place, it was 76% (122 blocks). I figure I could
> get the linear scanning figures even higher if I taught the allocator to
> reserve the pages it frees for the process performing the linear scanning.

Cool.

> However, I also know the linear scanner trashed the LRU lists and probably
> comes with all sorts of performance regressions just to make the
> high-order allocations.

Migrating pages instead of freeing them can greatly reduce the overhead I believe
and might be a low impact way of defragmenting memory.

> The new patches for the allocator (last patch I posted has a serious bug
> in it), the linear scanner and the results will be posted as another mail.
>
>
> > Your bitmap also allows a hint for the "defragmentator" to know the type
> > of pages, and possibly size of the block, so it can avoid earlier trying
> > to migrate non reclaimable memory. It possibly makes the scanning
> > procedure much lightweight.
> >
>
> Potentially. I need to catch up more on the existing schemes. I've been
> out of the VM loop for a long time now so I'm still playing the Catch-Up
> game.
>
> > > <SNIP>
> >
> > You want to do
> > free_pages -= (z->free_area_lists[0][o].nr_free + z->free_area_lists[2][o].nr_free +
> > z->free_area_lists[2][o].nr_free) << o;
> >
> > So not to interfere with the "min" decay (and remove the allocation type loop).
> >
>
> Agreed. New patch has this in place
>
> > >
> > > - /* Require fewer higher order pages to be free */
> > > - min >>= 1;
> > > + /* Require fewer higher order pages to be free */
> > > + min >>= 1;
> > >
> > > - if (free_pages <= min)
> > > - return 0;
> > > + if (free_pages <= min)
> > > + return 0;
> > > + }
> >
> > I'll play with your patch during the weekend, run some benchmarks (STP
> > is our friend), try to measure the overhead, etc.
> >
>
> I'll also look at STP again and start trying to move some of my tests to
> it (I have a fairly involved testing scheme right now). I recall that STP
> was very easy to get setup with, I was just pressed for time.

:)

> > Congrats, this is very cool!
> >
>
> Thanks a lot :)

I've added your patch to STP but:

[STP 300030]Kernel Patch Error Kernel: mel-three-type-allocator-v2 PLM # 4073

patching file usr/gen_init_cpio.c
patching file fs/buffer.c
Hunk #2 succeeded at 2998 with fuzz 1.
patching file fs/dcache.c
Hunk #1 FAILED at 715.
1 out of 1 hunk FAILED -- saving rejects to file fs/dcache.c.rej
patching file fs/ext2/super.c
patching file fs/ext3/super.c
patching file fs/ntfs/inode.c
patching file include/linux/gfp.h
patching file include/linux/mmzone.h
patching file mm/page_alloc.c
Hunk #10 FAILED at 581.
Hunk #11 succeeded at 591 with fuzz 1.
1 out of 22 hunks FAILED -- saving rejects to file mm/page_alloc.c.rej

It failed to apply to 2.6.10-rc1 - I'll work the rejects and rerun the tests.

Can you send me your last patch please?



2005-01-16 16:13:23

by Mel Gorman

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2


> > That is possible but it I haven't thought of a way of measuring the cache
> > colouring effects (if any). There is also the problem that the additional
> > complexity of the allocator will offset this benefit. The two main loss
> > points of the allocator are increased complexity and the increased size of
> > the zone struct.
>
> We should be able to measure that too...
>
> If you look at the performance numbers of applications which do data
> crunching, reading/writing data to disk (scientific applications). Or
> even databases, plus standard set of IO benchmarks...
>

I used two benchmarks to test this. The first was a test that ran gs
against a large postscript file 10 times and measured the average. The
hypothesis was that if I was trashing the CPU cache with the allocator,
there would be a marked difference between the results. The results are;

==> gsbench-2.6.11-rc1MBuddy.txt <==
Average: 115.47 real, 115.136 user, 0.338 sys

==> gsbench-2.6.11-rc1Standard.txt <==
Average: 115.468 real, 115.092 user, 0.337 sys

So, there is no significant difference there. I think we are safe with respect to
the CPU cache as neither allocator is particularly cache aware.

The second test was a portion of the tests from aim9. The results are

MBuddy
7 page_test 120.01 9452 78.76010 133892.18 System Allocations & Pages/second
8 brk_test 120.03 3386 28.20961 479563.44 System Memory Allocations/second
9 jmp_test 120.00 501496 4179.13333 4179133.33 Non-local gotos/second
10 signal_test 120.01 11632 96.92526 96925.26 Signal Traps/second
11 exec_test 120.07 1587 13.21729 66.09 Program Loads/second
12 fork_test 120.03 1890 15.74606 1574.61 Task Creations/second
13 link_test 120.00 11152 92.93333 5854.80 Link/Unlink Pairs/second
56 fifo_test 120.00 173450 1445.41667 144541.67 FIFO Messages/second

Vanilla
7 page_test 120.01 9536 79.46004 135082.08 System Allocations & Pages/second
8 brk_test 120.01 3394 28.28098 480776.60 System Memory Allocations/second
9 jmp_test 120.00 498770 4156.41667 4156416.67 Non-local gotos/second
10 signal_test 120.00 11773 98.10833 98108.33 Signal Traps/second
11 exec_test 120.01 1591 13.25723 66.29 Program Loads/second
12 fork_test 120.00 1941 16.17500 1617.50 Task Creations/second
13 link_test 120.00 11188 93.23333 5873.70 Link/Unlink Pairs/second
56 fifo_test 120.00 179156 1492.96667 149296.67 FIFO Messages/second

Here, there are worrying differences all right. The modified allocator for
example is getting 1000 faults a second less than the standard allocator
but that is still less than 1%. This is something I need to work on
although I think it's optimisation work rather than a fundamental problem
with the approach.

I'm looking into using bonnie++ as another IO benchmark.

> We should be able to use the CPU performance counters to get exact
> miss/hit numbers, but it seems its not yet possible to use Mikael's
> Pettersson pmc inside the kernel, I asked him sometime ago but never got
> along to trying anything:
>
> <SNIP>

This is stuff I was not aware of before and will need to follow up on.

> I think some CPU/memory intensive benchmarks should give us a hint of the total
> impact ?
>

The ghostscript test was the one I chose. Script is below

> > However, I also know the linear scanner trashed the LRU lists and probably
> > comes with all sorts of performance regressions just to make the
> > high-order allocations.
>
> Migrating pages instead of freeing them can greatly reduce the overhead I believe
> and might be a low impact way of defragmenting memory.
>

Very likely. As it is, the scanner I used is really stupid but I wanted to
show that using a mechanism like it, we should be able to almost guarantee
the allocation of a high-order block, something we cannot currently do.

> I've added your patch to STP but:
>
> [STP 300030]Kernel Patch Error Kernel: mel-three-type-allocator-v2 PLM # 4073
>

I posted a new version under the subject "[PATCH] 1/2 Reducing
fragmentation through better allocation". It should apply cleanly to a
vanilla kernel. Sorry about the mess of the other patch.

> It failed to apply to 2.6.10-rc1 - I'll work the rejects and rerun the tests.
>

The patch is against 2.6.11-rc1, but I'm guessing you typoed 2.6.10-rc1.

--
Mel Gorman

2005-01-16 17:48:52

by Marcelo Tosatti

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2

On Sat, Jan 15, 2005 at 07:18:42PM +0000, Mel Gorman wrote:
> On Fri, 14 Jan 2005, Marcelo Tosatti wrote:
>
> > On Thu, Jan 13, 2005 at 03:56:46PM +0000, Mel Gorman wrote:
> > > The patch is against 2.6.11-rc1 and I'm willing to stand by it's
> > > stability. I'm also confident it does it's job pretty well so I'd like it
> > > to be considered for inclusion.
> >
> > This is very interesting!
> >
>
> Thanks
>
> > Other than the advantage of decreased fragmentation which you aim, by
> > providing clustering of different types of allocations you might have a
> > performance gain (or loss :)) due to changes in cache colouring
> > effects.
> >
>
> That is possible but it I haven't thought of a way of measuring the cache
> colouring effects (if any). There is also the problem that the additional
> complexity of the allocator will offset this benefit. The two main loss
> points of the allocator are increased complexity and the increased size of
> the zone struct.
>
> > It depends on the workload/application mix and type of cache of course,
> > but I think there will be a significant measurable difference on most
> > common workloads.
> >
>
> If I could only measure it :/
>
> > Have you done any investigation with that respect? IMHO such
> > verification is really important before attempting to merge it.
> >
>
> No unfortunately. Do you know of a test I can use?

Some STP reaim results show a significant performance increase in general, with a few
small regressions.

I think that depending on the type of access pattern of the application(s) there
will be either a performance gain or a loss, but the result is interesting anyway. :)

I'll run more tests later on.

AIM OVERVIEW
The AIM Multiuser Benchmark - Suite VII tests and measures the performance of
Open System multiuser computers. Multiuser computer environments typically have
the following general characteristics in common:

- A large number of tasks are run concurrently
- Disk storage increases dramatically as the number of users increase.
- Complex numerically intense applications are performed infrequently
- An important amount of time is spent sorting and searching through large
amounts of data.
- After data is used it is placed back on disk because it is a shared resource.
- A large amount of time is spent in common runtime libraries.




NORMAL LOAD 4-way-SMP:


kernel: patch-2.6.11-rc1
plmid: 4066
Host: stp4-000
Reaim test
http://khack.osdl.org/stp/300031
kernel: 4066
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 4881.87 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 4961.19 (average of 3 runs)

kernel: mel-v3-fixed
plmid: 4077
Host: stp4-001
Reaim test
http://khack.osdl.org/stp/300056
kernel: 4077
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 5065.93 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 5294.48 (average of 3 runs)


NORMAL LOAD 1-WAY:

kernel: patch-2.6.11-rc1
plmid: 4066
Host: stp1-003
Reaim test
http://khack.osdl.org/stp/300029
kernel: 4066
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 993.13 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 983.11 (average of 3 runs)


kernel: mel-v3-fixed
plmid: 4077
Host: stp1-002
Reaim test
http://khack.osdl.org/stp/300055
kernel: 4077
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 982.69 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 1008.06 (average of 3 runs)


COMPUTE LOAD 2way (this is more CPU intensive than NORMAL reaim load):

kernel: patch-2.6.11-rc1
plmid: 4066
Host: stp2-001
Reaim test
http://khack.osdl.org/stp/300060
kernel: 4066
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 1482.45 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 1487.20 (average of 3 runs)

kernel: mel-v3-fixed
plmid: 4077
Host: stp2-000
Reaim test
http://khack.osdl.org/stp/300058
kernel: 4077
Filesystem: ext3
Peak load Test: Maximum Jobs per Minute 1501.47 (average of 3 runs)
Quick Convergence Test: Maximum Jobs per Minute 1462.11 (average of 3 runs)








2005-01-16 20:33:29

by Mel Gorman

Subject: Re: [PATCH] Avoiding fragmentation through different allocator V2

On Sun, 16 Jan 2005, Marcelo Tosatti wrote:

> > No unfortunately. Do you know of a test I can use?
>
> Some STP reaim results have significant performance increase in general, a few
> small regressions.
>
> I think that depending on the type of access pattern of the application(s) there
> will be either performance gain or loss, but the result is interesting anyway. :)
>

That is quite exciting and I'm pleased it was able to show gains in some
tests. Based on the aim9 tests, I took a look at the paths I affected to
see what improvements I could make. There were three significant ones
(a sketch of the second change follows the list):

1. I inlined get_pageblock_type and set_pageblock_type
2. set_pageblock_type was calling page_zone() even though the only caller
already knew the zone, so I added it as a parameter
3. When taking from the global pool, I was rescanning all the order lists,
which it no longer does
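
For the second change, the result looks something like the sketch below (names taken
from the earlier patch; this is an illustration of the change, not the posted V3 hunk):

/* Now inline, and taking the zone the caller already holds instead of
 * recomputing it with page_zone(page) */
static inline void set_pageblock_type(struct page *page, struct zone *zone,
				      int type)
{
	int bitidx = ((page - zone->zone_mem_map) >> MAX_ORDER) * 2;

	/* First bit set: KernRclm; second bit set: UserRclm; neither: KernNoRclm */
	if (type == ALLOC_KERNRCLM) {
		set_bit(bitidx, zone->free_area_usemap);
		clear_bit(bitidx + 1, zone->free_area_usemap);
	} else if (type == ALLOC_USERRCLM) {
		clear_bit(bitidx, zone->free_area_usemap);
		set_bit(bitidx + 1, zone->free_area_usemap);
	} else {
		clear_bit(bitidx, zone->free_area_usemap);
		clear_bit(bitidx + 1, zone->free_area_usemap);
	}
}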

I am hoping that these three changes will clear up the worst of the minor
regressions.

With the changes, aim9 reported that the modified allocator performs as
well as the standard allocator. This means that the allocator is as fast,
we are reasonably sure there are no adverse cache effects (if anything,
cache usage is improved) and we are far more likely to be able to service
high-order requests.

root@monocle:~# grep _test aim9-vanilla-120.txt
7 page_test 120.00 9508 79.23333 134696.67 System Allocations & Pages/second
8 brk_test 120.01 3401 28.33931 481768.19 System Memory Allocations/second
9 jmp_test 120.00 498718 4155.98333 4155983.33 Non-local gotos/second
10 signal_test 120.01 11768 98.05850 98058.50 Signal Traps/second
11 exec_test 120.04 1585 13.20393 66.02 Program Loads/second
12 fork_test 120.04 1979 16.48617 1648.62 Task Creations/second
13 link_test 120.01 11174 93.10891 5865.86 Link/Unlink Pairs/second
root@monocle:~# grep _test aim9-mbuddyV3-120.txt
7 page_test 120.01 9660 80.49329 136838.60 System Allocations & Pages/second
8 brk_test 120.01 3409 28.40597 482901.42 System Memory Allocations/second
9 jmp_test 120.00 501533 4179.44167 4179441.67 Non-local gotos/second
10 signal_test 120.00 11677 97.30833 97308.33 Signal Traps/second
11 exec_test 120.05 1585 13.20283 66.01 Program Loads/second
12 fork_test 120.05 1889 15.73511 1573.51 Task Creations/second
13 link_test 120.01 11089 92.40063 5821.24 Link/Unlink Pairs/second


Patch with minor optimisations as follows;

diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/buffer.c linux-2.6.11-rc1-mbuddy/fs/buffer.c
--- linux-2.6.11-rc1-clean/fs/buffer.c 2005-01-12 04:01:23.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/buffer.c 2005-01-13 10:56:30.000000000 +0000
@@ -1134,7 +1134,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;

- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USERRCLM);
if (!page)
return NULL;

@@ -2997,7 +2998,8 @@ static void recalc_bh_state(void)

struct buffer_head *alloc_buffer_head(int gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/dcache.c linux-2.6.11-rc1-mbuddy/fs/dcache.c
--- linux-2.6.11-rc1-clean/fs/dcache.c 2005-01-12 04:00:09.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/dcache.c 2005-01-13 10:56:30.000000000 +0000
@@ -715,7 +715,8 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;

- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache,
+ GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;

diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext2/super.c linux-2.6.11-rc1-mbuddy/fs/ext2/super.c
--- linux-2.6.11-rc1-clean/fs/ext2/super.c 2005-01-12 04:01:24.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext2/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -137,7 +137,7 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext3/super.c linux-2.6.11-rc1-mbuddy/fs/ext3/super.c
--- linux-2.6.11-rc1-clean/fs/ext3/super.c 2005-01-12 04:02:11.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext3/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -434,7 +434,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;

- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ntfs/inode.c linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c
--- linux-2.6.11-rc1-clean/fs/ntfs/inode.c 2005-01-12 04:01:45.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c 2005-01-13 10:56:30.000000000 +0000
@@ -318,7 +318,7 @@ struct inode *ntfs_alloc_big_inode(struc

ntfs_debug("Entering.");
ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
- SLAB_NOFS);
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -343,7 +343,8 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;

ntfs_debug("Entering.");
- ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache,
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/gfp.h linux-2.6.11-rc1-mbuddy/include/linux/gfp.h
--- linux-2.6.11-rc1-clean/include/linux/gfp.h 2005-01-12 04:00:35.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/gfp.h 2005-01-15 18:16:47.000000000 +0000
@@ -38,21 +38,24 @@ struct vm_area_struct;
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
#define __GFP_COMP 0x4000 /* Add compound page metadata */
#define __GFP_ZERO 0x8000 /* Return zeroed page on success */
+#define __GFP_KERNRCLM 0x10000 /* Kernel page that is easily reclaimable */
+#define __GFP_USERRCLM 0x20000 /* Userspace page that is easily reclaimable */

-#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 18 /* Room for 18 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)

/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
- __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
+ __GFP_USERRCLM|__GFP_KERNRCLM)

#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USERRCLM)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_USERRCLM)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/mmzone.h linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h
--- linux-2.6.11-rc1-clean/include/linux/mmzone.h 2005-01-12 04:01:17.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h 2005-01-13 14:24:27.000000000 +0000
@@ -19,6 +19,10 @@
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
+#define ALLOC_TYPES 3
+#define ALLOC_KERNNORCLM 0
+#define ALLOC_KERNRCLM 1
+#define ALLOC_USERRCLM 2

struct free_area {
struct list_head free_list;
@@ -131,8 +135,37 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];

+ /*
+ * There are ALLOC_TYPES arrays of MAX_ORDER free lists, one per
+ * allocation type. Once a MAX_ORDER block of pages has been split
+ * for an allocation type, the whole block is reserved for that type
+ * of allocation. The types are User Reclaimable, Kernel Reclaimable
+ * and Kernel Non-reclaimable. The objective is to reduce overall
+ * fragmentation
+ */
+ struct free_area free_area_lists[ALLOC_TYPES][MAX_ORDER];
+
+ /*
+ * This is a list of page blocks of size 2^MAX_ORDER. Once one of
+ * these is split, the buddy is added to the appropriate
+ * free_area_lists. When the buddies are later merged, they
+ * are placed back here
+ */
+ struct free_area free_area_global;
+
+ /*
+ * This map tracks what each 2^MAX_ORDER-sized block has been used for.
+ * Each 2^MAX_ORDER block of pages has 2 bits in this map to remember
+ * what the block is for. When a page is freed, its index within this
+ * bitmap is calculated as ((page - zone_mem_map) >> MAX_ORDER) * 2,
+ * which means that pages will always be freed into the correct list
+ * in free_area_lists
+ *
+ * The bits are set when a 2^MAX_ORDER block of pages is split
+ */
+
+ unsigned long *free_area_usemap;

ZONE_PADDING(_pad1_)

diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/mm/page_alloc.c linux-2.6.11-rc1-mbuddy/mm/page_alloc.c
--- linux-2.6.11-rc1-clean/mm/page_alloc.c 2005-01-12 04:00:02.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/mm/page_alloc.c 2005-01-16 16:40:39.000000000 +0000
@@ -46,9 +46,30 @@ unsigned long totalhigh_pages;
long nr_swap_pages;
int sysctl_lower_zone_protection = 0;

+/* Bean counters for the per-type buddy allocator */
+int fallback_count[ALLOC_TYPES] = { 0, 0, 0};
+int global_steal=0;
+int global_refill=0;
+int kernnorclm_count=0;
+int kernrclm_count=0;
+int userrclm_count=0;
+
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);

+/*
+ * The allocator tries to put allocations of the same type in the
+ * same 2^MAX_ORDER blocks of pages. When memory is low, this may
+ * not be possible, so this table describes the order in which each
+ * type falls back to the others
+ */
+int fallback_allocs[ALLOC_TYPES][ALLOC_TYPES] = {
+ { ALLOC_KERNNORCLM, ALLOC_KERNRCLM, ALLOC_USERRCLM },
+ { ALLOC_KERNRCLM, ALLOC_KERNNORCLM, ALLOC_USERRCLM },
+ { ALLOC_USERRCLM, ALLOC_KERNNORCLM, ALLOC_KERNRCLM }
+};
+
+
/*
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
@@ -57,6 +78,7 @@ struct zone *zone_table[1 << (ZONES_SHIF
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *type_names[ALLOC_TYPES] = { "KernNoRclm", "KernRclm", "UserRclm"};
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
@@ -103,6 +125,46 @@ static void bad_page(const char *functio
tainted |= TAINT_BAD_PAGE;
}

+/*
+ * Return the type of allocation that the 2^MAX_ORDER block of pages
+ * containing the given page is reserved for
+ */
+static inline int get_pageblock_type(struct page *page) {
+ struct zone *zone = page_zone(page);
+ int bitidx = ((page - zone->zone_mem_map) >> MAX_ORDER) * 2;
+
+ /* Bit 1 will be set if the block is kernel reclaimable */
+ if (test_bit(bitidx,zone->free_area_usemap)) return ALLOC_KERNRCLM;
+
+ /* Bit 2 will be set if the block is user reclaimable */
+ if (test_bit(bitidx+1, zone->free_area_usemap)) return ALLOC_USERRCLM;
+
+ return ALLOC_KERNNORCLM;
+}
+
+static inline void set_pageblock_type(struct page *page,
+ struct zone *zone, int type) {
+ int bit1, bit2;
+ int bitidx = ((page - zone->zone_mem_map) >> MAX_ORDER) * 2;
+ bit1 = bit2 = 0;
+
+ if (type == ALLOC_KERNRCLM) {
+ set_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ if (type == ALLOC_USERRCLM) {
+ clear_bit(bitidx, zone->free_area_usemap);
+ set_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ clear_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+
+}
+
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
@@ -231,6 +293,7 @@ static inline void __free_pages_bulk (st
unsigned long page_idx;
struct page *coalesced;
int order_size = 1 << order;
+ struct free_area *area;

if (unlikely(order))
destroy_compound_page(page, order);
@@ -240,9 +303,12 @@ static inline void __free_pages_bulk (st
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));

+ /* Select the area to use for freeing based on the type */
+ struct free_area *freelist =
+ zone->free_area_lists[get_pageblock_type(page)];
+
zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
- struct free_area *area;
struct page *buddy;
int buddy_idx;

@@ -254,16 +320,29 @@ static inline void __free_pages_bulk (st
break;
/* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = freelist + order;
area->nr_free--;
rmv_page_order(buddy);
page_idx &= buddy_idx;
order++;
}
+
+ /*
+ * If a MAX_ORDER block of pages is being freed, it is
+ * no longer reserved for a particular type of allocation
+ * so put it in the global list
+ */
+ if (order >= MAX_ORDER-1) {
+ area = &(zone->free_area_global);
+ global_refill++;
+ } else {
+ area = freelist + order;
+ }
+
coalesced = base + page_idx;
set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ list_add(&coalesced->lru, &area->free_list);
+ area->nr_free++;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -310,6 +389,7 @@ free_pages_bulk(struct zone *zone, int c
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
page = list_entry(list->prev, struct page, lru);
+
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, base, zone, order);
@@ -420,16 +500,42 @@ static void prep_new_page(struct page *p
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int flags)
{
struct free_area * area;
unsigned int current_order;
struct page *page;
+ int global_split=0;
+
+ /* Select area to use based on gfp_flags */
+ int alloctype;
+ int retry_count=0;
+ int startorder = order;
+ if (flags & __GFP_USERRCLM) {
+ alloctype = ALLOC_USERRCLM;
+ userrclm_count++;
+ }
+ else if (flags & __GFP_KERNRCLM) {
+ alloctype = ALLOC_KERNRCLM;
+ kernrclm_count++;
+ } else {
+ alloctype = ALLOC_KERNNORCLM;
+ kernnorclm_count++;
+ }
+
+ /* Ok, pick the fallback order based on the type */
+ int *fallback_list = fallback_allocs[alloctype];
+
+retry:
+ alloctype = fallback_list[retry_count];
+ area = zone->free_area_lists[alloctype] + startorder;
+ for (current_order = startorder;
+ current_order < MAX_ORDER; ++current_order) {

- for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
- if (list_empty(&area->free_list))
+ if (list_empty(&area->free_list)) {
+ area++;
continue;
+ }

page = list_entry(area->free_list.next, struct page, lru);
list_del(&page->lru);
@@ -439,6 +545,36 @@ static struct page *__rmqueue(struct zon
return expand(zone, page, order, current_order, area);
}

+ /* Take from the global pool if this is the first attempt */
+ if (!global_split && !list_empty(&(zone->free_area_global.free_list))){
+ /*
+ * Remove a MAX_ORDER block from the global pool and add
+ * it to the list of desired alloc_type
+ */
+ page = list_entry(zone->free_area_global.free_list.next,
+ struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru,
+ &(zone->free_area_lists[alloctype][MAX_ORDER-1].free_list));
+ global_steal++;
+ global_split=1;
+
+ /* Mark this block of pages as for use with this alloc type */
+ set_pageblock_type(page, zone, alloctype);
+ startorder = MAX_ORDER-1;
+
+ goto retry;
+ }
+
+ /*
+ * Here, the lists for this alloc type have been depleted, as has the
+ * global pool, so fall back to the other types
+ */
+ retry_count++;
+ startorder=order;
+ fallback_count[alloctype]++;
+ if (retry_count != ALLOC_TYPES) goto retry;
+
return NULL;
}

@@ -448,7 +584,8 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list,
+ int gfp_flags)
{
unsigned long flags;
int i;
@@ -457,7 +594,7 @@ static int rmqueue_bulk(struct zone *zon

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
if (page == NULL)
break;
allocated++;
@@ -493,7 +630,7 @@ static void __drain_pages(unsigned int c
void mark_free_pages(struct zone *zone)
{
unsigned long zone_pfn, flags;
- int order;
+ int order, type;
struct list_head *curr;

if (!zone->spanned_pages)
@@ -503,14 +640,17 @@ void mark_free_pages(struct zone *zone)
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

- for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
- unsigned long start_pfn, i;
+ for (type=0; type < ALLOC_TYPES; type++) {

- start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ for (order = MAX_ORDER - 1; order >= 0; --order)
+ list_for_each(curr, &zone->free_area_lists[type][order].free_list) {
+ unsigned long start_pfn, i;
+
+ start_pfn = page_to_pfn(list_entry(curr, struct page, lru));

- for (i=0; i < (1<<order); i++)
- SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ for (i=0; i < (1<<order); i++)
+ SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -612,14 +752,15 @@ buffered_rmqueue(struct zone *zone, int
struct page *page = NULL;
int cold = !!(gfp_flags & __GFP_COLD);

- if (order == 0) {
+ if (order == 0 && (gfp_flags & __GFP_USERRCLM)) {
struct per_cpu_pages *pcp;

pcp = &zone->pageset[get_cpu()].pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list,
+ gfp_flags);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -631,7 +772,7 @@ buffered_rmqueue(struct zone *zone, int

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
spin_unlock_irqrestore(&zone->lock, flags);
}

@@ -658,6 +799,7 @@ int zone_watermark_ok(struct zone *z, in
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
+ struct free_area *kernnorclm, *kernrclm, *userrclm;
int o;

if (gfp_high)
@@ -667,9 +809,15 @@ int zone_watermark_ok(struct zone *z, in

if (free_pages <= min + z->protection[alloc_type])
return 0;
+ kernnorclm = z->free_area_lists[ALLOC_KERNNORCLM];
+ kernrclm = z->free_area_lists[ALLOC_KERNRCLM];
+ userrclm = z->free_area_lists[ALLOC_USERRCLM];
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ free_pages -= (
+ kernnorclm[o].nr_free +
+ kernrclm[o].nr_free +
+ userrclm[o].nr_free) << o;

/* Require fewer higher order pages to be free */
min >>= 1;
@@ -1124,6 +1272,7 @@ void show_free_areas(void)
unsigned long inactive;
unsigned long free;
struct zone *zone;
+ int type;

for_each_zone(zone) {
show_node(zone);
@@ -1216,8 +1365,10 @@ void show_free_areas(void)

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
- total += nr << order;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ nr = zone->free_area_lists[type][order].nr_free;
+ total += nr << order;
+ }
printk("%lu*%lukB ", nr, K(1UL) << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -1515,10 +1666,22 @@ void zone_init_free_lists(struct pglist_
unsigned long size)
{
int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
+ int type;
+ struct free_area *area;
+
+ /* Initialise the three size-ordered lists of free_areas */
+ for (type=0; type < ALLOC_TYPES; type++) {
+ for (order = 0; order < MAX_ORDER; order++) {
+ area = zone->free_area_lists[type];
+
+ INIT_LIST_HEAD(&area[order].free_list);
+ area[order].nr_free = 0;
+ }
}
+
+ /* Initialise the global pool of 2^MAX_ORDER blocks of pages */
+ INIT_LIST_HEAD(&zone->free_area_global.free_list);
+ zone->free_area_global.nr_free=0;
}

#ifndef __HAVE_ARCH_MEMMAP_INIT
@@ -1539,6 +1702,7 @@ static void __init free_area_init_core(s
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
int cpu, nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ unsigned long usemapsize;

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
@@ -1637,6 +1801,22 @@ static void __init free_area_init_core(s
zone_start_pfn += size;

zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+ /* Calculate size of required bitmap */
+ /* - Number of MAX_ORDER blocks in the zone */
+ usemapsize = (size + (1 << MAX_ORDER)) >> MAX_ORDER;
+
+ /* - Two bits to record what type of block it is */
+ usemapsize = (usemapsize * 2 + 8) / 8;
+
+ zone->free_area_usemap =
+ (unsigned long *)alloc_bootmem_node(pgdat, usemapsize);
+
+ memset((unsigned long *)zone->free_area_usemap,
+ ALLOC_KERNNORCLM, usemapsize);
+
+ printk(KERN_DEBUG " %s zone: %lu pages, %lu real pages, usemap size:%lu\n",
+ zone_names[j], size, realsize, usemapsize);
}
}

@@ -1714,19 +1894,90 @@ static int frag_show(struct seq_file *m,
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- int order;
+ int order, type;
+ struct list_head *elem;

- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!zone->present_pages)
- continue;

- spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
- spin_unlock_irqrestore(&zone->lock, flags);
- seq_putc(m, '\n');
- }
+ /* Show global fragmentation statistics */
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ unsigned long nr_bufs = 0;
+ for (order = 0; order < MAX_ORDER-1; ++order) {
+ nr_bufs = 0;
+
+ for (type=0; type < ALLOC_TYPES; type++) {
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ }
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+
+ /* Scan global list */
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show statistics for each allocation type */
+ seq_printf(m, "\nPer-allocation-type statistics");
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ unsigned long nr_bufs = 0;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ seq_printf(m, "\nNode %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ type_names[type]);
+ struct list_head *elem;
+ for (order = 0; order < MAX_ORDER; ++order) {
+ nr_bufs = 0;
+
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+ }
+
+ /* Scan global list */
+ seq_printf(m, "\n");
+ seq_printf(m, "Node %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ "MAX_ORDER");
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show bean counters */
+ seq_printf(m, "\nGlobal beancounters\n");
+ seq_printf(m, "Global steals: %d\n", global_steal);
+ seq_printf(m, "Global refills: %d\n", global_refill);
+ seq_printf(m, "KernNoRclm allocs: %d\n", kernnorclm_count);
+ seq_printf(m, "KernRclm allocs: %d\n", kernrclm_count);
+ seq_printf(m, "UserRclm allocs: %d\n", userrclm_count);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[0],
+ fallback_count[0]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[1],
+ fallback_count[1]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[2],
+ fallback_count[2]);
+
+
+
return 0;
}