2005-01-01 10:25:24

by Geert Uytterhoeven

[permalink] [raw]
Subject: Re: Prezeroing V2 [2/4]: add second parameter to clear_page() for all arches

On Thu, 23 Dec 2004, Christoph Lameter wrote:
> o Extend clear_page to take an order parameter for all architectures.

> Index: linux-2.6.9/include/asm-m68k/page.h
> ===================================================================
> --- linux-2.6.9.orig/include/asm-m68k/page.h 2004-10-18 14:55:36.000000000 -0700
> +++ linux-2.6.9/include/asm-m68k/page.h 2004-12-23 07:44:14.000000000 -0800
> @@ -50,7 +50,7 @@
> );
> }
>
> -static inline void clear_page(void *page)
> +static inline void clear_page(void *page, int order)
> {
> unsigned long tmp;
> unsigned long *sp = page;
> @@ -69,16 +69,16 @@
> "dbra %1,1b\n\t"
> : "=a" (sp), "=d" (tmp)
> : "a" (page), "0" (sp),
> - "1" ((PAGE_SIZE - 16) / 16 - 1));
> + "1" (((PAGE_SIZE<<(order)) - 16) / 16 - 1));
> }
>
> #else
> -#define clear_page(page) memset((page), 0, PAGE_SIZE)
> +#define clear_page(page, 0) memset((page), 0, PAGE_SIZE << (order))
^
order

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


2005-01-04 23:26:14

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V3 [3/4]: Page zeroing through kscrubd

o Add page zeroing
o Add scrub daemon
o Add ability to view amount of zeroed information in /proc/meninfo

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-04 14:17:02.000000000 -0800
@@ -12,6 +12,7 @@
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ * Support for page zeroing, Christoph Lameter, SGI, Dec 2004
*/

#include <linux/config.h>
@@ -33,6 +34,7 @@
#include <linux/cpu.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/scrub.h>

#include <asm/tlbflush.h>

@@ -180,7 +182,7 @@
* -- wli
*/

-static inline void __free_pages_bulk (struct page *page, struct page *base,
+static inline int __free_pages_bulk (struct page *page, struct page *base,
struct zone *zone, struct free_area *area, unsigned int order)
{
unsigned long page_idx, index, mask;
@@ -193,11 +195,10 @@
BUG();
index = page_idx >> (1 + order);

- zone->free_pages += 1 << order;
while (order < MAX_ORDER-1) {
struct page *buddy1, *buddy2;

- BUG_ON(area >= zone->free_area + MAX_ORDER);
+ BUG_ON(area >= zone->free_area[ZEROED] + MAX_ORDER);
if (!__test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
@@ -219,6 +220,7 @@
}
list_add(&(base + page_idx)->lru, &area->free_list);
area->nr_free++;
+ return order;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -261,7 +263,7 @@
int ret = 0;

base = zone->zone_mem_map;
- area = zone->free_area + order;
+ area = zone->free_area[NOT_ZEROED] + order;
spin_lock_irqsave(&zone->lock, flags);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
@@ -269,7 +271,10 @@
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, base, zone, area, order);
+ zone->free_pages += 1 << order;
+ if (__free_pages_bulk(page, base, zone, area, order)
+ >= sysctl_scrub_start)
+ wakeup_kscrubd(zone);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -291,6 +296,21 @@
free_pages_bulk(page_zone(page), 1, &list, order);
}

+void end_zero_page(struct page *page)
+{
+ unsigned long flags;
+ int order = page->index;
+ struct zone * zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ zone->zero_pages += 1 << order;
+ __free_pages_bulk(page, zone->zone_mem_map, zone, zone->free_area[ZEROED] + order, order);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+
#define MARK_USED(index, order, area) \
__change_bit((index) >> (1+(order)), (area)->map)

@@ -370,26 +390,47 @@
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static void inline rmpage(struct page *page, struct zone *zone, struct free_area *area, int order)
+{
+ list_del(&page->lru);
+ area->nr_free--;
+ if (order != MAX_ORDER-1)
+ MARK_USED(page - zone->zone_mem_map, order, area);
+}
+
+struct page *scrubd_rmpage(struct zone *zone, struct free_area *area, int order)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ if (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+
+ rmpage(page, zone, area, order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return page;
+}
+
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero)
{
struct free_area * area;
unsigned int current_order;
struct page *page;
- unsigned int index;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ area = zone->free_area[zero] + current_order;
if (list_empty(&area->free_list))
continue;

page = list_entry(area->free_list.next, struct page, lru);
- list_del(&page->lru);
- area->nr_free--;
- index = page - zone->zone_mem_map;
- if (current_order != MAX_ORDER-1)
- MARK_USED(index, current_order, area);
+ rmpage(page, zone, area, current_order);
zone->free_pages -= 1UL << order;
- return expand(zone, page, index, order, current_order, area);
+ if (zero)
+ zone->zero_pages -= 1UL << order;
+ return expand(zone, page, page - zone->zone_mem_map, order, current_order, area);
}

return NULL;
@@ -401,7 +442,7 @@
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list, int zero)
{
unsigned long flags;
int i;
@@ -410,7 +451,7 @@

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
if (page == NULL)
break;
allocated++;
@@ -457,7 +498,7 @@
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
+ list_for_each(curr, &zone->free_area[NOT_ZEROED][order].free_list) {
unsigned long start_pfn, i;

start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
@@ -555,7 +596,9 @@
{
unsigned long flags;
struct page *page = NULL;
- int cold = !!(gfp_flags & __GFP_COLD);
+ int nr_pages = 1 << order;
+ int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages);
+ int cold = !!(gfp_flags & __GFP_COLD) + 2*zero;

if (order == 0) {
struct per_cpu_pages *pcp;
@@ -564,7 +607,7 @@
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list, zero);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -576,19 +619,30 @@

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+
+ page = __rmqueue(zone, order, zero);
+
+ /*
+ * If we failed to obtain a zero and/or unzeroed page
+ * then we may still be able to obtain the other
+ * type of page.
+ */
+ if (!page) {
+ page = __rmqueue(zone, order, !zero);
+ zero = 0;
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
}

if (page != NULL) {
BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
- prep_new_page(page, order);
+ mod_page_state_zone(zone, pgalloc, nr_pages);

- if (gfp_flags & __GFP_ZERO) {
+ if ((gfp_flags & __GFP_ZERO) && !zero) {
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page)) {
- int n = 1 << order;
+ int n = nr_pages;

while (n-- >0)
clear_highpage(page + n);
@@ -596,6 +650,7 @@
#endif
clear_page(page_address(page), order);
}
+ prep_new_page(page, order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
}
@@ -622,7 +677,7 @@
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ free_pages -= (z->free_area[NOT_ZEROED][o].nr_free + z->free_area[ZEROED][o].nr_free) << o;

/* Require fewer higher order pages to be free */
min >>= 1;
@@ -1000,7 +1055,7 @@
}

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat)
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat)
{
struct zone *zones = pgdat->node_zones;
int i;
@@ -1008,27 +1063,31 @@
*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
*active += zones[i].nr_active;
*inactive += zones[i].nr_inactive;
*free += zones[i].free_pages;
+ *zero += zones[i].zero_pages;
}
}

void get_zone_counts(unsigned long *active,
- unsigned long *inactive, unsigned long *free)
+ unsigned long *inactive, unsigned long *free, unsigned long *zero)
{
struct pglist_data *pgdat;

*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for_each_pgdat(pgdat) {
- unsigned long l, m, n;
- __get_zone_counts(&l, &m, &n, pgdat);
+ unsigned long l, m, n,o;
+ __get_zone_counts(&l, &m, &n, &o, pgdat);
*active += l;
*inactive += m;
*free += n;
+ *zero += o;
}
}

@@ -1065,6 +1124,7 @@

#define K(x) ((x) << (PAGE_SHIFT-10))

+const char *temp[3] = { "hot", "cold", "zero" };
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -1077,6 +1137,7 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;
struct zone *zone;

for_each_zone(zone) {
@@ -1097,10 +1158,10 @@

pageset = zone->pageset + cpu;

- for (temperature = 0; temperature < 2; temperature++)
+ for (temperature = 0; temperature < 3; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
cpu,
- temperature ? "cold" : "hot",
+ temp[temperature],
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch);
@@ -1108,20 +1169,21 @@
}

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

printk("\nFree pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));

printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
+ zero,
ps.nr_slab,
ps.nr_mapped,
ps.nr_page_table_pages);
@@ -1170,7 +1232,7 @@

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
+ nr = zone->free_area[NOT_ZEROED][order].nr_free + zone->free_area[ZEROED][order].nr_free;
total += nr << order;
printk("%lu*%lukB ", nr, K(1UL) << order);
}
@@ -1493,16 +1555,21 @@
for (order = 0; ; order++) {
unsigned long bitmap_size;

- INIT_LIST_HEAD(&zone->free_area[order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list);
if (order == MAX_ORDER-1) {
- zone->free_area[order].map = NULL;
+ zone->free_area[NOT_ZEROED][order].map = NULL;
+ zone->free_area[ZEROED][order].map = NULL;
break;
}

bitmap_size = pages_to_bitmap_size(order, size);
- zone->free_area[order].map =
+ zone->free_area[NOT_ZEROED][order].map =
+ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+ zone->free_area[ZEROED][order].map =
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
- zone->free_area[order].nr_free = 0;
+ zone->free_area[NOT_ZEROED][order].nr_free = 0;
+ zone->free_area[ZEROED][order].nr_free = 0;
}
}

@@ -1527,6 +1594,7 @@

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->kscrubd_wait);
pgdat->kswapd_max_order = 0;

for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -1550,6 +1618,7 @@
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->zero_pages = 0;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

@@ -1583,6 +1652,13 @@
pcp->high = 2 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1708,7 +1784,7 @@
spin_lock_irqsave(&zone->lock, flags);
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_printf(m, "%6lu ", zone->free_area[NOT_ZEROED][order].nr_free);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
Index: linux-2.6.10/include/linux/mmzone.h
===================================================================
--- linux-2.6.10.orig/include/linux/mmzone.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/linux/mmzone.h 2005-01-04 14:17:02.000000000 -0800
@@ -52,7 +52,7 @@
};

struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+ struct per_cpu_pages pcp[3]; /* 0: hot. 1: cold 2: cold zeroed pages */
#ifdef CONFIG_NUMA
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
@@ -108,10 +108,14 @@
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/

+#define NOT_ZEROED 0
+#define ZEROED 1
+
struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+ unsigned long zero_pages;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
@@ -132,7 +136,7 @@
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[2][MAX_ORDER];


ZONE_PADDING(_pad1_)
@@ -267,6 +271,9 @@
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
+
+ wait_queue_head_t kscrubd_wait;
+ struct task_struct *kscrubd;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -276,9 +283,9 @@
extern struct pglist_data *pgdat_list;

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat);
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free);
+ unsigned long *free, unsigned long *zero);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Index: linux-2.6.10/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.10.orig/fs/proc/proc_misc.c 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/fs/proc/proc_misc.c 2005-01-04 14:17:02.000000000 -0800
@@ -158,13 +158,14 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;
unsigned long vmtot;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

/*
* display in kilobytes.
@@ -187,6 +188,7 @@
len = sprintf(page,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemZero: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -210,6 +212,7 @@
"VmallocChunk: %8lu kB\n",
K(i.totalram),
K(i.freeram),
+ K(zero),
K(i.bufferram),
K(get_page_cache_size()-total_swapcache_pages-i.bufferram),
K(total_swapcache_pages),
Index: linux-2.6.10/mm/readahead.c
===================================================================
--- linux-2.6.10.orig/mm/readahead.c 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/mm/readahead.c 2005-01-04 14:17:02.000000000 -0800
@@ -573,7 +573,8 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;

- __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id()));
return min(nr, (inactive + free) / 2);
}
Index: linux-2.6.10/drivers/base/node.c
===================================================================
--- linux-2.6.10.orig/drivers/base/node.c 2005-01-04 14:17:00.000000000 -0800
+++ linux-2.6.10/drivers/base/node.c 2005-01-04 14:17:02.000000000 -0800
@@ -41,13 +41,15 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;

si_meminfo_node(&i, nid);
- __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid));

n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
+ "Node %d MemZero: %8lu kB\n"
"Node %d MemUsed: %8lu kB\n"
"Node %d Active: %8lu kB\n"
"Node %d Inactive: %8lu kB\n"
@@ -57,6 +59,7 @@
"Node %d LowFree: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
+ nid, K(zero),
nid, K(i.totalram - i.freeram),
nid, K(active),
nid, K(inactive),
Index: linux-2.6.10/include/linux/sched.h
===================================================================
--- linux-2.6.10.orig/include/linux/sched.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/linux/sched.h 2005-01-04 14:17:02.000000000 -0800
@@ -715,6 +715,7 @@
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
+#define PF_KSCRUBD 0x00800000 /* I am kscrubd */

#ifdef CONFIG_SMP
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
Index: linux-2.6.10/mm/Makefile
===================================================================
--- linux-2.6.10.orig/mm/Makefile 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/mm/Makefile 2005-01-04 14:17:02.000000000 -0800
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o scrubd.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o prio_tree.o \
Index: linux-2.6.10/mm/scrubd.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/mm/scrubd.c 2005-01-04 14:58:46.000000000 -0800
@@ -0,0 +1,147 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/suspend.h>
+#include <linux/sysctl.h>
+#include <linux/scrub.h>
+
+unsigned int sysctl_scrub_start = 7; /* if a page of this order is coalesed then run kscrubd */
+unsigned int sysctl_scrub_stop = 2; /* Mininum order of page to zero */
+unsigned int sysctl_scrub_load = 999; /* Do not run scrubd if load > */
+
+/*
+ * sysctl handler for /proc/sys/vm/scrub_start
+ */
+int scrub_start_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (sysctl_scrub_start < MAX_ORDER) {
+ struct zone *zone;
+
+ for_each_zone(zone)
+ wakeup_kscrubd(zone);
+ }
+ return 0;
+}
+
+LIST_HEAD(zero_drivers);
+
+/*
+ * zero_highest_order_page takes a page off the freelist
+ * and then hands it off to block zeroing agents.
+ * The cleared pages are added to the back of
+ * the freelist where the page allocator may pick them up.
+ */
+int zero_highest_order_page(struct zone *z)
+{
+ int order;
+
+ for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) {
+ struct free_area *area = z->free_area[NOT_ZEROED] + order;
+ if (!list_empty(&area->free_list)) {
+ struct page *page = scrubd_rmpage(z, area, order);
+ struct list_head *l;
+
+ if (!page)
+ continue;
+
+ page->index = order;
+
+ list_for_each(l, &zero_drivers) {
+ struct zero_driver *driver = list_entry(l, struct zero_driver, list);
+ unsigned long size = PAGE_SIZE << order;
+
+ if (driver->start(page_address(page), size) == 0) {
+
+ unsigned ticks = (size*HZ)/driver->rate;
+ if (ticks) {
+ /* Wait the minimum time of the transfer */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(ticks);
+ }
+ /* Then keep on checking until transfer is complete */
+ while (!driver->check())
+ schedule();
+ goto out;
+ }
+ }
+
+ /* Unable to find a zeroing device that would
+ * deal with this page so just do it on our own.
+ * This will likely thrash the cpu caches.
+ */
+ cond_resched();
+ clear_page(page_address(page), order);
+out:
+ end_zero_page(page);
+ cond_resched();
+ return 1 << order;
+ }
+ }
+ return 0;
+}
+
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+ int i;
+ unsigned long pages_zeroed;
+
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ pages_zeroed = 0;
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ pages_zeroed += zero_highest_order_page(zone);
+ }
+ } while (pages_zeroed);
+}
+
+/*
+ * The background scrub daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kscrubd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ cpumask_t cpumask;
+
+ daemonize("kscrubd%d", pgdat->node_id);
+ cpumask = node_to_cpumask(pgdat->node_id);
+ if (!cpus_empty(cpumask))
+ set_cpus_allowed(tsk, cpumask);
+
+ tsk->flags |= PF_MEMALLOC | PF_KSCRUBD;
+
+ for ( ; ; ) {
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_FREEZE);
+ prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&pgdat->kscrubd_wait, &wait);
+
+ scrub_pgdat(pgdat);
+ }
+ return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+ pg_data_t *pgdat;
+ for_each_pgdat(pgdat)
+ pgdat->kscrubd
+ = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+ return 0;
+}
+
+module_init(kscrubd_init)
Index: linux-2.6.10/include/linux/scrub.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/include/linux/scrub.h 2005-01-04 14:17:02.000000000 -0800
@@ -0,0 +1,51 @@
+#ifndef _LINUX_SCRUB_H
+#define _LINUX_SCRUB_H
+
+/*
+ * Definitions for scrubbing of memory include an interface
+ * for drivers that may that allow the zeroing of memory
+ * without invalidating the caches.
+ *
+ * Christoph Lameter, December 2004.
+ */
+
+struct zero_driver {
+ int (*start)(void *, unsigned long); /* Start bzero transfer */
+ int (*check)(void); /* Check if bzero is complete */
+ unsigned long rate; /* zeroing rate in bytes/sec */
+ struct list_head list;
+};
+
+extern struct list_head zero_drivers;
+
+extern unsigned int sysctl_scrub_start;
+extern unsigned int sysctl_scrub_stop;
+extern unsigned int sysctl_scrub_load;
+
+/* Registering and unregistering zero drivers */
+static inline void register_zero_driver(struct zero_driver *z)
+{
+ list_add(&z->list, &zero_drivers);
+}
+
+static inline void unregister_zero_driver(struct zero_driver *z)
+{
+ list_del(&z->list);
+}
+
+extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area, int order);
+
+static void inline wakeup_kscrubd(struct zone *zone)
+{
+ if (avenrun[0] >= (unsigned long)sysctl_scrub_load << FSHIFT)
+ return;
+ if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+ return;
+ wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
+int scrub_start_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+extern void end_zero_page(struct page *page);
+#endif
Index: linux-2.6.10/kernel/sysctl.c
===================================================================
--- linux-2.6.10.orig/kernel/sysctl.c 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/kernel/sysctl.c 2005-01-04 14:17:02.000000000 -0800
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/scrub.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
@@ -826,6 +827,33 @@
.strategy = &sysctl_jiffies,
},
#endif
+ {
+ .ctl_name = VM_SCRUB_START,
+ .procname = "scrub_start",
+ .data = &sysctl_scrub_start,
+ .maxlen = sizeof(sysctl_scrub_start),
+ .mode = 0644,
+ .proc_handler = &scrub_start_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_STOP,
+ .procname = "scrub_stop",
+ .data = &sysctl_scrub_stop,
+ .maxlen = sizeof(sysctl_scrub_stop),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_LOAD,
+ .procname = "scrub_load",
+ .data = &sysctl_scrub_load,
+ .maxlen = sizeof(sysctl_scrub_load),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

Index: linux-2.6.10/include/linux/sysctl.h
===================================================================
--- linux-2.6.10.orig/include/linux/sysctl.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/linux/sysctl.h 2005-01-04 14:17:02.000000000 -0800
@@ -169,6 +169,9 @@
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+ VM_SCRUB_START=30, /* percentage * 10 at which to start scrubd */
+ VM_SCRUB_STOP=31, /* percentage * 10 at which to stop scrubd */
+ VM_SCRUB_LOAD=31, /* Load factor at which not to scrub anymore */
};



2005-01-04 23:31:25

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V3 [1/4]: Allow request for zeroed memory

This patch introduces __GFP_ZERO as an additional gfp_mask element to allow
to request zeroed pages from the page allocator.

o Modifies the page allocator so that it zeroes memory if __GFP_ZERO is set

o Replace all page zeroing after allocating pages by request for
zeroed pages.

o requires arch updates to clear_page in order to function properly.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-04 12:16:49.000000000 -0800
@@ -584,6 +584,18 @@
BUG_ON(bad_range(zone, page));
mod_page_state_zone(zone, pgalloc, 1 << order);
prep_new_page(page, order);
+
+ if (gfp_flags & __GFP_ZERO) {
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page)) {
+ int n = 1 << order;
+
+ while (n-- >0)
+ clear_highpage(page + n);
+ } else
+#endif
+ clear_page(page_address(page), order);
+ }
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
}
@@ -796,12 +808,9 @@
*/
BUG_ON(gfp_mask & __GFP_HIGHMEM);

- page = alloc_pages(gfp_mask, 0);
- if (page) {
- void *address = page_address(page);
- clear_page(address);
- return (unsigned long) address;
- }
+ page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
+ if (page)
+ return (unsigned long) page_address(page);
return 0;
}

Index: linux-2.6.10/include/linux/gfp.h
===================================================================
--- linux-2.6.10.orig/include/linux/gfp.h 2004-12-24 13:34:27.000000000 -0800
+++ linux-2.6.10/include/linux/gfp.h 2005-01-04 12:16:49.000000000 -0800
@@ -37,6 +37,7 @@
#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
#define __GFP_COMP 0x4000 /* Add compound page metadata */
+#define __GFP_ZERO 0x8000 /* Return zeroed page on success */

#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
@@ -52,6 +53,7 @@
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHZERO (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_ZERO)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/mm/memory.c 2005-01-04 12:16:49.000000000 -0800
@@ -1650,10 +1650,9 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
if (!page)
goto no_mem;
- clear_user_highpage(page, addr);

spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, addr);
Index: linux-2.6.10/kernel/profile.c
===================================================================
--- linux-2.6.10.orig/kernel/profile.c 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/kernel/profile.c 2005-01-04 12:16:49.000000000 -0800
@@ -326,17 +326,15 @@
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
return NOTIFY_BAD;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_free;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
}
break;
@@ -510,16 +508,14 @@
int node = cpu_to_node(cpu);
struct page *page;

- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[0]
= (struct profile_hit *)page_address(page);
}
Index: linux-2.6.10/mm/shmem.c
===================================================================
--- linux-2.6.10.orig/mm/shmem.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/mm/shmem.c 2005-01-04 12:16:49.000000000 -0800
@@ -369,9 +369,8 @@
}

spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
if (page) {
- clear_highpage(page);
page->nr_swapped = 0;
}
spin_lock(&info->lock);
@@ -910,7 +909,7 @@
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
@@ -926,7 +925,7 @@
shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info,
unsigned long idx)
{
- return alloc_page(gfp);
+ return alloc_page(gfp | __GFP_ZERO);
}
#endif

@@ -1135,7 +1134,6 @@

info->alloced++;
spin_unlock(&info->lock);
- clear_highpage(filepage);
flush_dcache_page(filepage);
SetPageUptodate(filepage);
}
Index: linux-2.6.10/mm/hugetlb.c
===================================================================
--- linux-2.6.10.orig/mm/hugetlb.c 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/mm/hugetlb.c 2005-01-04 12:16:49.000000000 -0800
@@ -77,7 +77,6 @@
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -88,8 +87,7 @@
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ clear_page(page_address(page), HUGETLB_PAGE_ORDER);
return page;
}

Index: linux-2.6.10/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/pgalloc.h 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -61,9 +61,7 @@
pgd_t *pgd = pgd_alloc_one_fast(mm);

if (unlikely(pgd == NULL)) {
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
- if (likely(pgd != NULL))
- clear_page(pgd);
+ pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
}
return pgd;
}
@@ -106,10 +104,8 @@
static inline pmd_t*
pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);

- if (likely(pmd != NULL))
- clear_page(pmd);
return pmd;
}

@@ -140,20 +136,16 @@
static inline struct page *
pte_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

- if (likely(pte != NULL))
- clear_page(page_address(pte));
return pte;
}

static inline pte_t *
pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);

- if (likely(pte != NULL))
- clear_page(pte);
return pte;
}

Index: linux-2.6.10/arch/i386/mm/pgtable.c
===================================================================
--- linux-2.6.10.orig/arch/i386/mm/pgtable.c 2005-01-04 12:16:39.000000000 -0800
+++ linux-2.6.10/arch/i386/mm/pgtable.c 2005-01-04 12:16:49.000000000 -0800
@@ -140,10 +140,7 @@

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
- return pte;
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -151,12 +148,10 @@
struct page *pte;

#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
- if (pte)
- clear_highpage(pte);
return pte;
}

Index: linux-2.6.10/arch/m68k/mm/motorola.c
===================================================================
--- linux-2.6.10.orig/arch/m68k/mm/motorola.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/m68k/mm/motorola.c 2005-01-04 12:16:49.000000000 -0800
@@ -1,4 +1,4 @@
-/*
+*
* linux/arch/m68k/motorola.c
*
* Routines specific to the Motorola MMU, originally from:
@@ -50,7 +50,7 @@

ptablep = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);

- clear_page(ptablep);
+ clear_page(ptablep, 0);
__flush_page_to_ram(ptablep);
flush_tlb_kernel_page(ptablep);
nocache_page(ptablep);
@@ -90,7 +90,7 @@
if (((unsigned long)last_pgtable & ~PAGE_MASK) == 0) {
last_pgtable = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);

- clear_page(last_pgtable);
+ clear_page(last_pgtable, 0);
__flush_page_to_ram(last_pgtable);
flush_tlb_kernel_page(last_pgtable);
nocache_page(last_pgtable);
Index: linux-2.6.10/include/asm-mips/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/pgalloc.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-mips/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -56,9 +56,7 @@
{
pte_t *pte;

- pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, PTE_ORDER);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, PTE_ORDER);

return pte;
}
Index: linux-2.6.10/arch/alpha/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/mm/init.c 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/arch/alpha/mm/init.c 2005-01-04 12:16:49.000000000 -0800
@@ -42,10 +42,9 @@
{
pgd_t *ret, *init;

- ret = (pgd_t *)__get_free_page(GFP_KERNEL);
+ ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
init = pgd_offset(&init_mm, 0UL);
if (ret) {
- clear_page(ret);
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
memcpy (ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD - 1)*sizeof(pgd_t));
@@ -63,9 +62,7 @@
pte_t *
pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

Index: linux-2.6.10/include/asm-parisc/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/pgalloc.h 2004-12-24 13:35:39.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -120,18 +120,14 @@
static inline struct page *
pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
- if (likely(page != NULL))
- clear_page(page_address(page));
+ struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return page;
}

static inline pte_t *
pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (likely(pte != NULL))
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

Index: linux-2.6.10/arch/sh/mm/pg-sh4.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-sh4.c 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-sh4.c 2005-01-04 12:16:49.000000000 -0800
@@ -34,7 +34,7 @@
{
__set_bit(PG_mapped, &page->flags);
if (((address ^ (unsigned long)to) & CACHE_ALIAS) == 0)
- clear_page(to);
+ clear_page(to, 0);
else {
pgprot_t pgprot = __pgprot(_PAGE_PRESENT |
_PAGE_RW | _PAGE_CACHABLE |
Index: linux-2.6.10/include/asm-sparc64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/pgalloc.h 2004-12-24 13:35:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -73,10 +73,9 @@
struct page *page;

preempt_enable();
- page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (page) {
ret = (struct page *)page_address(page);
- clear_page(ret);
page->lru.prev = (void *) 2UL;

preempt_disable();
Index: linux-2.6.10/include/asm-sh/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/pgalloc.h 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/include/asm-sh/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -44,9 +44,7 @@
{
pte_t *pte;

- pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);

return pte;
}
@@ -56,9 +54,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

return pte;
}
Index: linux-2.6.10/include/asm-m32r/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/pgalloc.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -23,10 +23,7 @@
*/
static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
-
- if (pgd)
- clear_page(pgd);
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);

return pgd;
}
@@ -39,10 +36,7 @@
static __inline__ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL);
-
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);

return pte;
}
@@ -50,10 +44,8 @@
static __inline__ struct page *pte_alloc_one(struct mm_struct *mm,
unsigned long address)
{
- struct page *pte = alloc_page(GFP_KERNEL);
+ struct page *pte = alloc_page(GFP_KERNEL|__GFP_ZERO);

- if (pte)
- clear_page(page_address(pte));

return pte;
}
Index: linux-2.6.10/arch/um/kernel/mem.c
===================================================================
--- linux-2.6.10.orig/arch/um/kernel/mem.c 2005-01-04 12:16:40.000000000 -0800
+++ linux-2.6.10/arch/um/kernel/mem.c 2005-01-04 12:16:49.000000000 -0800
@@ -327,9 +327,7 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

@@ -337,9 +335,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_highpage(pte);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
return pte;
}

Index: linux-2.6.10/arch/ppc64/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/ppc64/mm/init.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/ppc64/mm/init.c 2005-01-04 12:16:49.000000000 -0800
@@ -761,7 +761,7 @@

void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
- clear_page(page);
+ clear_page(page, 0);

if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)
return;
Index: linux-2.6.10/include/asm-sh64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/pgalloc.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -112,9 +112,7 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT|__GFP_ZERO);

return pte;
}
@@ -123,9 +121,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

return pte;
}
@@ -150,9 +146,7 @@
static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
pmd_t *pmd;
- pmd = (pmd_t *) __get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pmd)
- clear_page(pmd);
+ pmd = (pmd_t *) __get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pmd;
}

Index: linux-2.6.10/include/asm-cris/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/pgalloc.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-cris/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -24,18 +24,14 @@

extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

extern inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
return pte;
}

Index: linux-2.6.10/arch/ppc/mm/pgtable.c
===================================================================
--- linux-2.6.10.orig/arch/ppc/mm/pgtable.c 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/arch/ppc/mm/pgtable.c 2005-01-04 12:16:49.000000000 -0800
@@ -85,8 +85,7 @@
{
pgd_t *ret;

- if ((ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER)) != NULL)
- clear_pages(ret, PGDIR_ORDER);
+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
return ret;
}

@@ -102,7 +101,7 @@
extern void *early_get_page(void);

if (mem_init_done) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
struct page *ptepage = virt_to_page(pte);
ptepage->mapping = (void *) mm;
@@ -110,8 +109,6 @@
}
} else
pte = (pte_t *)early_get_page();
- if (pte)
- clear_page(pte);
return pte;
}

Index: linux-2.6.10/arch/ppc/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/ppc/mm/init.c 2005-01-04 12:16:40.000000000 -0800
+++ linux-2.6.10/arch/ppc/mm/init.c 2005-01-04 12:16:49.000000000 -0800
@@ -594,7 +594,7 @@
}
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
- clear_page(page);
+ clear_page(page, 0);
clear_bit(PG_arch_1, &pg->flags);
}

Index: linux-2.6.10/fs/afs/file.c
===================================================================
--- linux-2.6.10.orig/fs/afs/file.c 2004-12-24 13:35:59.000000000 -0800
+++ linux-2.6.10/fs/afs/file.c 2005-01-04 12:16:49.000000000 -0800
@@ -172,7 +172,7 @@
(size_t) PAGE_SIZE);
desc.buffer = kmap(page);

- clear_page(desc.buffer);
+ clear_page(desc.buffer, 0);

/* read the contents of the file from the server into the
* page */
Index: linux-2.6.10/include/asm-alpha/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/pgalloc.h 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -40,9 +40,7 @@
static inline pmd_t *
pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
- pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (ret)
- clear_page(ret);
+ pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return ret;
}

Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-04 12:16:49.000000000 -0800
@@ -45,7 +45,7 @@
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
- clear_page(kaddr);
+ clear_page(kaddr, 0);
kunmap_atomic(kaddr, KM_USER0);
}

Index: linux-2.6.10/arch/sh64/mm/ioremap.c
===================================================================
--- linux-2.6.10.orig/arch/sh64/mm/ioremap.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/sh64/mm/ioremap.c 2005-01-04 12:16:49.000000000 -0800
@@ -399,7 +399,7 @@
if (pte_none(*ptep) || !pte_present(*ptep))
return;

- clear_page((void *)ptep);
+ clear_page((void *)ptep, 0);
pte_clear(ptep);
}

Index: linux-2.6.10/include/asm-m68k/motorola_pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/motorola_pgalloc.h 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/motorola_pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -12,9 +12,8 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
- clear_page(pte);
__flush_page_to_ram(pte);
flush_tlb_kernel_page(pte);
nocache_page(pte);
@@ -31,7 +30,7 @@

static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
pte_t *pte;

if(!page)
@@ -39,7 +38,6 @@

pte = kmap(page);
if (pte) {
- clear_page(pte);
__flush_page_to_ram(pte);
flush_tlb_kernel_page(pte);
nocache_page(pte);
Index: linux-2.6.10/arch/sh/mm/pg-sh7705.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-sh7705.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-sh7705.c 2005-01-04 12:16:49.000000000 -0800
@@ -78,13 +78,13 @@

__set_bit(PG_mapped, &page->flags);
if (((address ^ (unsigned long)to) & CACHE_ALIAS) == 0) {
- clear_page(to);
+ clear_page(to, 0);
__flush_wback_region(to, PAGE_SIZE);
} else {
__flush_purge_virtual_region(to,
(void *)(address & 0xfffff000),
PAGE_SIZE);
- clear_page(to);
+ clear_page(to, 0);
__flush_wback_region(to, PAGE_SIZE);
}
}
Index: linux-2.6.10/arch/sparc64/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sparc64/mm/init.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/sparc64/mm/init.c 2005-01-04 12:16:49.000000000 -0800
@@ -1687,13 +1687,12 @@
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
- mem_map_zero = alloc_pages(GFP_KERNEL, 0);
+ mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
}
SetPageReserved(mem_map_zero);
- clear_page(page_address(mem_map_zero));

codepages = (((unsigned long) _etext) - ((unsigned long) _start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
Index: linux-2.6.10/include/asm-arm/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/pgalloc.h 2004-12-24 13:35:29.000000000 -0800
+++ linux-2.6.10/include/asm-arm/pgalloc.h 2005-01-04 12:16:49.000000000 -0800
@@ -50,9 +50,8 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
- clear_page(pte);
clean_dcache_area(pte, sizeof(pte_t) * PTRS_PER_PTE);
pte += PTRS_PER_PTE;
}
@@ -65,10 +64,9 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
if (pte) {
void *page = page_address(pte);
- clear_page(page);
clean_dcache_area(page, sizeof(pte_t) * PTRS_PER_PTE);
}

Index: linux-2.6.10/drivers/net/tc35815.c
===================================================================
--- linux-2.6.10.orig/drivers/net/tc35815.c 2004-12-24 13:33:48.000000000 -0800
+++ linux-2.6.10/drivers/net/tc35815.c 2005-01-04 12:16:49.000000000 -0800
@@ -657,7 +657,7 @@
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
} else {
- clear_page(lp->fd_buf);
+ clear_page(lp->fd_buf, 0);
#ifdef __mips__
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
Index: linux-2.6.10/drivers/block/pktcdvd.c
===================================================================
--- linux-2.6.10.orig/drivers/block/pktcdvd.c 2004-12-24 13:33:49.000000000 -0800
+++ linux-2.6.10/drivers/block/pktcdvd.c 2005-01-04 12:16:49.000000000 -0800
@@ -135,12 +135,10 @@
goto no_bio;

for (i = 0; i < PAGES_PER_PACKET; i++) {
- pkt->pages[i] = alloc_page(GFP_KERNEL);
+ pkt->pages[i] = alloc_page(GFP_KERNEL|| __GFP_ZERO);
if (!pkt->pages[i])
goto no_page;
}
- for (i = 0; i < PAGES_PER_PACKET; i++)
- clear_page(page_address(pkt->pages[i]));

spin_lock_init(&pkt->lock);


2005-01-04 23:36:20

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V3 [2/4]: Extension of clear_page to take an order parameter

o Extend clear_page to take an order parameter for all architectures.

Architecture support:
---------------------

Known to work:

ia64
i386
sparc64
m68k

Trivial modification expected to simply work:

arm
cris
h8300
m68knommu
ppc
ppc64
sh64
v850
parisc
sparc
um

Modification made but it would be good to have some feedback from the arch maintainers:

x86_64
s390
alpha
sh
mips
m32r

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -56,7 +56,7 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_page (void *page, int order);
extern void copy_page (void *to, void *from);

/*
@@ -65,7 +65,7 @@
*/
#define clear_user_page(addr, vaddr, page) \
do { \
- clear_page(addr); \
+ clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)

Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_page(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,12 +28,12 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-04 12:16:41.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -32,10 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_page(void *, int);
void copy_page(void *, void *);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-sparc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -28,10 +28,10 @@

#ifndef __ASSEMBLY__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
sparc_flush_page_to_ram(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -22,12 +22,12 @@

#ifndef __s390x__

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
register_pair rp;

rp.subreg.even = (unsigned long) page;
- rp.subreg.odd = (unsigned long) 4096;
+ rp.subreg.odd = (unsigned long) 4096 << order;
asm volatile (" slr 1,1\n"
" mvcl %0,0"
: "+&a" (rp) : : "memory", "cc", "1" );
@@ -63,14 +63,19 @@

#else /* __s390x__ */

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
- asm volatile (" lgr 2,%0\n"
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ asm volatile (" lgr 2,%0\n"
" lghi 3,4096\n"
" slgr 1,1\n"
" mvcl 2,0"
: : "a" ((void *) (page))
: "memory", "cc", "1", "2", "3" );
+ page += PAGE_SIZE;
+ }
}

static inline void copy_page(void *to, void *from)
@@ -103,7 +108,7 @@

#endif /* __s390x__ */

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/* Pure 2^n version of get_order */
Index: linux-2.6.10/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800
+++ linux-2.6.10/arch/i386/lib/mmx.c 2005-01-04 12:34:03.000000000 -0800
@@ -128,7 +128,7 @@
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.10/include/asm-x86_64/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/mmx.h 2005-01-04 12:34:03.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800
+++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-01-04 12:34:03.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

GLOBAL_ENTRY(clear_page)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
Index: linux-2.6.10/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-01-04 12:34:03.000000000 -0800
@@ -7,6 +7,7 @@
clear_page:
xorl %eax,%eax
movl $4096/64,%ecx
+ shl %esi, %ecx
.p2align 4
.Lloop:
decl %ecx
@@ -42,6 +43,7 @@
.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
+ shl %esi, %ecx
xorl %eax,%eax
rep
stosq
Index: linux-2.6.10/include/asm-sh/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/page.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-sh/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -36,12 +36,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void (*clear_page)(void *to);
+extern void (*_clear_page)(void *to);
extern void (*copy_page)(void *to, void *from);

extern void clear_page_slow(void *to);
extern void copy_page_slow(void *to, void *from);

+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
#if defined(CONFIG_SH7705_CACHE_32KB) && defined(CONFIG_MMU)
struct page;
extern void clear_user_page(void *to, unsigned long address, struct page *pg);
@@ -49,7 +59,7 @@
extern void __clear_user_page(void *to, void *orig_to);
extern void __copy_user_page(void *to, void *from, void *orig_to);
#elif defined(CONFIG_CPU_SH2) || defined(CONFIG_CPU_SH3) || !defined(CONFIG_MMU)
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#elif defined(CONFIG_CPU_SH4)
struct page;
Index: linux-2.6.10/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-i386/mmx.h 2005-01-04 12:34:03.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/alpha/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/clear_page.S 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/clear_page.S 2005-01-04 12:34:03.000000000 -0800
@@ -6,11 +6,10 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0
-
lda $0,128
nop
unop
@@ -36,4 +35,4 @@
unop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/include/asm-sh64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -50,12 +50,20 @@
extern void sh64_page_clear(void *page);
extern void sh64_page_copy(void *from, void *to);

-#define clear_page(page) sh64_page_clear(page)
+static inline void clear_page(page, order)
+{
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ sh64_page_clear(page++, 0);
+ }
+}
+
#define copy_page(to,from) sh64_page_copy(from, to)

#if defined(CONFIG_DCACHE_DISABLED)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) sh_clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#else
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-arm/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-arm/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -128,7 +128,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
extern void copy_page(void *to, const void *from);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-ppc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc64/page.h 2004-12-24 13:33:49.000000000 -0800
+++ linux-2.6.10/include/asm-ppc64/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -102,12 +102,12 @@
#define REGION_MASK (((1UL<<REGION_SIZE)-1UL)<<REGION_SHIFT)
#define REGION_STRIDE (1UL << REGION_SHIFT)

-static __inline__ void clear_page(void *addr)
+static __inline__ void clear_page(void *addr, int order)
{
unsigned long lines, line_size;

line_size = systemcfg->dCacheL1LineSize;
- lines = naca->dCacheL1LinesPerPage;
+ lines = naca->dCacheL1LinesPerPage << order;

__asm__ __volatile__(
"mtctr %1 # clear_page\n\
Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -11,10 +11,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void *to);
+extern void _clear_page(void *to);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- > 0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+
extern void copy_page(void *to, void *from);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -15,8 +15,20 @@

#define STRICT_MM_TYPECHECKS

-extern void clear_page(void *page);
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+extern void _clear_page(void *page);
+
+static inline void clear_page(void *page, int order)
+{
+ int nr = 1 << order;
+
+ while (nr--)
+ {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)

extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.10/arch/mips/mm/pg-sb1.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-sb1.c 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-sb1.c 2005-01-04 12:34:03.000000000 -0800
@@ -42,7 +42,7 @@
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void clear_page_cpu(void *page)
#else
-void clear_page(void *page)
+void _clear_page(void *page)
#endif
{
unsigned char *addr = (unsigned char *) page;
@@ -172,14 +172,13 @@
IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_BASE)));
}

-void clear_page(void *page)
+void _clear_page(void *page)
{
int cpu = smp_processor_id();

/* if the page is above Kseg0, use old way */
if (KSEGX(page) != CAC_BASE)
return clear_page_cpu(page);
-
page_descr[cpu].dscr_a = PHYSADDR(page) | M_DM_DSCRA_ZERO_MEM | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
page_descr[cpu].dscr_b = V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
__raw_writeq(1, IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_COUNT)));
@@ -218,5 +217,5 @@

#endif

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);
EXPORT_SYMBOL(copy_page);
Index: linux-2.6.10/include/asm-m68k/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/page.h 2004-12-24 13:35:49.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -50,7 +50,7 @@
);
}

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
unsigned long tmp;
unsigned long *sp = page;
@@ -69,16 +69,16 @@
"dbra %1,1b\n\t"
: "=a" (sp), "=d" (tmp)
: "a" (page), "0" (sp),
- "1" ((PAGE_SIZE - 16) / 16 - 1));
+ "1" (((PAGE_SIZE<<(order)) - 16) / 16 - 1));
}

#else
-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)
#endif

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-mips/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/page.h 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/include/asm-mips/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -39,7 +39,18 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void * page);
+extern void _clear_page(void * page);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
extern void copy_page(void * to, void * from);

extern unsigned long shm_align_mask;
@@ -57,7 +68,7 @@
{
extern void (*flush_data_cache_page)(unsigned long addr);

- clear_page(addr);
+ clear_page(addr, 0);
if (pages_do_alias((unsigned long) addr, vaddr))
flush_data_cache_page((unsigned long)addr);
}
Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -15,10 +15,10 @@

#ifdef __KERNEL__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-v850/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-v850/page.h 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/include/asm-v850/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -37,11 +37,11 @@

#define STRICT_MM_TYPECHECKS

-#define clear_page(page) memset ((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset ((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to, from) memcpy ((void *)(to), (void *)from, PAGE_SIZE)

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-parisc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/page.h 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -13,7 +13,7 @@
#include <asm/types.h>
#include <asm/cache.h>

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) copy_user_page_asm((void *)(to), (void *)(from))

struct page;
Index: linux-2.6.10/arch/arm/mm/copypage-v6.c
===================================================================
--- linux-2.6.10.orig/arch/arm/mm/copypage-v6.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/arm/mm/copypage-v6.c 2005-01-04 12:34:03.000000000 -0800
@@ -47,7 +47,7 @@
*/
void v6_clear_user_page_nonaliasing(void *kaddr, unsigned long vaddr)
{
- clear_page(kaddr);
+ _clear_page(kaddr);
}

/*
@@ -116,7 +116,7 @@

set_pte(to_pte + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
flush_tlb_kernel_page(to);
- clear_page((void *)to);
+ _clear_page((void *)to);

spin_unlock(&v6_lock);
}
Index: linux-2.6.10/arch/m32r/mm/page.S
===================================================================
--- linux-2.6.10.orig/arch/m32r/mm/page.S 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/arch/m32r/mm/page.S 2005-01-04 12:34:03.000000000 -0800
@@ -51,7 +51,7 @@
jmp r14

.text
- .global clear_page
+ .global _clear_page
/*
* clear_page (to)
*
@@ -60,7 +60,7 @@
* 16 * 256
*/
.align 4
-clear_page:
+_clear_page:
ldi r2, #255
ldi r4, #0
ld r3, @r0 /* cache line allocate */
Index: linux-2.6.10/include/asm-ppc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-ppc/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -85,7 +85,7 @@

struct page;
extern void clear_pages(void *page, int order);
-static inline void clear_page(void *page) { clear_pages(page, 0); }
+#define clear_page clear_pages
extern void copy_page(void *to, void *from);
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
Index: linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/kernel/alpha_ksyms.c 2004-12-24 13:33:51.000000000 -0800
+++ linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c 2005-01-04 12:34:03.000000000 -0800
@@ -88,7 +88,7 @@
EXPORT_SYMBOL(__memsetw);
EXPORT_SYMBOL(__constant_c_memset);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(__direct_map_base);
EXPORT_SYMBOL(__direct_map_size);
Index: linux-2.6.10/arch/alpha/lib/ev6-clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/ev6-clear_page.S 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/ev6-clear_page.S 2005-01-04 12:34:03.000000000 -0800
@@ -6,9 +6,9 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0

lda $0,128
@@ -51,4 +51,4 @@
nop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/arch/sh/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/init.c 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/init.c 2005-01-04 12:34:03.000000000 -0800
@@ -57,7 +57,7 @@
#endif

void (*copy_page)(void *from, void *to);
-void (*clear_page)(void *to);
+void (*_clear_page)(void *to);

void show_mem(void)
{
@@ -255,7 +255,7 @@
* later in the boot process if a better method is available.
*/
copy_page = copy_page_slow;
- clear_page = clear_page_slow;
+ _clear_page = clear_page_slow;

/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem_node(NODE_DATA(0));
Index: linux-2.6.10/arch/sh/mm/pg-dma.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-dma.c 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-dma.c 2005-01-04 12:34:03.000000000 -0800
@@ -78,7 +78,7 @@
return ret;

copy_page = copy_page_dma;
- clear_page = clear_page_dma;
+ _clear_page = clear_page_dma;

return ret;
}
Index: linux-2.6.10/arch/sh/mm/pg-nommu.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-nommu.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-nommu.c 2005-01-04 12:34:03.000000000 -0800
@@ -27,7 +27,7 @@
static int __init pg_nommu_init(void)
{
copy_page = copy_page_nommu;
- clear_page = clear_page_nommu;
+ _clear_page = clear_page_nommu;

return 0;
}
Index: linux-2.6.10/arch/mips/mm/pg-r4k.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-r4k.c 2004-12-24 13:34:49.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-r4k.c 2005-01-04 12:34:03.000000000 -0800
@@ -39,9 +39,9 @@

static unsigned int clear_page_array[0x130 / 4];

-void clear_page(void * page) __attribute__((alias("clear_page_array")));
+void _clear_page(void * page) __attribute__((alias("clear_page_array")));

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

/*
* Maximum sizes:
Index: linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/m32r/kernel/m32r_ksyms.c 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c 2005-01-04 12:34:03.000000000 -0800
@@ -102,7 +102,7 @@
EXPORT_SYMBOL(memcmp);
EXPORT_SYMBOL(memscan);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(strcat);
EXPORT_SYMBOL(strchr);
Index: linux-2.6.10/include/asm-arm26/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm26/page.h 2004-12-24 13:35:22.000000000 -0800
+++ linux-2.6.10/include/asm-arm26/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -25,7 +25,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
#define copy_page(to, from) __copy_user_page(to, from, 0);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/page.h 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/page.h 2005-01-04 12:34:03.000000000 -0800
@@ -14,8 +14,8 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, unsigned long order);
+#define clear_page(X,Y) _clear_page((void *)(X),(Y))
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.10/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800
+++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-01-04 12:34:03.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8

2005-01-04 23:36:39

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V3 [4/4]: Driver for hardware zeroing on Altix

o Zeroing driver implemented with the Block Transfer Engine in the Altix
SN2 SHub.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/arch/ia64/sn/kernel/bte.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/bte.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/bte.c 2005-01-03 13:36:07.000000000 -0800
@@ -4,6 +4,8 @@
* for more details.
*
* Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * Support for zeroing pages, Christoph Lameter, SGI, December 2004.
*/

#include <linux/config.h>
@@ -20,6 +22,8 @@
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/scrub.h>

#include <asm/sn/bte.h>

@@ -30,7 +34,7 @@
/* two interfaces on two btes */
#define MAX_INTERFACES_TO_TRY 4

-static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
+static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
{
nodepda_t *tmp_nodepda;

@@ -132,7 +136,6 @@
if (bte == NULL) {
continue;
}
-
if (spin_trylock(&bte->spinlock)) {
if (!(*bte->most_rcnt_na & BTE_WORD_AVAILABLE) ||
(BTE_LNSTAT_LOAD(bte) & BTE_ACTIVE)) {
@@ -157,7 +160,7 @@
}
} while (1);

- if (notification == NULL) {
+ if (notification == NULL || (mode & BTE_NOTIFY_AND_GET_POINTER)) {
/* User does not want to be notified. */
bte->most_rcnt_na = &bte->notify;
} else {
@@ -192,6 +195,8 @@

itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec);

+ if (mode & BTE_NOTIFY_AND_GET_POINTER)
+ *(u64 volatile **)(notification) = &bte->notify;
spin_unlock_irqrestore(&bte->spinlock, irq_flags);

if (notification != NULL) {
@@ -449,5 +454,37 @@
mynodepda->bte_if[i].cleanup_active = 0;
mynodepda->bte_if[i].bh_error = 0;
}
+}
+
+u64 *bte_zero_notify[MAX_COMPACT_NODES];
+
+static int bte_check_bzero(void)
+{
+ int node = get_nasid();
+
+ return *(bte_zero_notify[node]) != BTE_WORD_BUSY;
+}
+
+static int bte_start_bzero(void *p, unsigned long len)
+{
+ int node = get_nasid();
+
+ /* Check limitations.
+ 1. System must be running (weird things happen during bootup)
+ 2. Size >64KB. Smaller requests cause too much bte traffic
+ */
+ if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING)
+ return EINVAL;
+
+ return bte_zero(ia64_tpa(p), len, BTE_NOTIFY_AND_GET_POINTER, bte_zero_notify+node);
+}
+
+static struct zero_driver bte_bzero = {
+ .start = bte_start_bzero,
+ .check = bte_check_bzero,
+ .rate = 500000000 /* 500 MB /sec */
+};

+void sn_bte_bzero_init(void) {
+ register_zero_driver(&bte_bzero);
}
Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c 2004-12-24 13:34:27.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/setup.c 2005-01-03 13:36:07.000000000 -0800
@@ -243,6 +243,7 @@
int pxm;
int major = sn_sal_rev_major(), minor = sn_sal_rev_minor();
extern void sn_cpu_init(void);
+ extern void sn_bte_bzero_init(void);

/*
* If the generic code has enabled vga console support - lets
@@ -333,6 +334,7 @@
screen_info = sn_screen_info;

sn_timer_init();
+ sn_bte_bzero_init();
}

/**
Index: linux-2.6.10/include/asm-ia64/sn/bte.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/sn/bte.h 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/sn/bte.h 2005-01-03 13:36:07.000000000 -0800
@@ -48,6 +48,8 @@
#define BTE_ZERO_FILL (BTE_NOTIFY | IBCT_ZFIL_MODE)
/* Use a reserved bit to let the caller specify a wait for any BTE */
#define BTE_WACQUIRE (0x4000)
+/* Return the pointer to the notification cacheline to the user */
+#define BTE_NOTIFY_AND_GET_POINTER (0x8000)
/* Use the BTE on the node with the destination memory */
#define BTE_USE_DEST (BTE_WACQUIRE << 1)
/* Use any available BTE interface on any node for the transfer */

2005-01-04 23:41:26

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V3 [0/4]: Discussion and i386 performance tests

Change from V2 to V3:
o Updates for clear_page on various platforms
o Performance measurements on i386 (2x PIII-450 384M RAM)
o Port patches to 2.6.10-bk7
o Add scrub_load so that a high load prevents scrubd from running
(So that people may feel better about this approach. Set by
default to 999 so its off. The typical result of not running kscrubd
under high loads is to slow the system down even further since zeroing
large consecutive areas of memory is more efficient than zeroing page
size chunks. Memory subsystems are typically optimized for linear accesses
and reach their peak performance if large areas of memory are written to)
o Various fixes

The patches increasing the page fault rate (introduction of atomic pte
operations and anticipatory prefaulting) do so by reducing the locking
overhead and are therefore mainly of interest for applications running in
SMP systems with a high number of cpus. The single thread performance does
just show minor increases. Only the performance of multi-threaded
applications increases significantly.

The most expensive operation in the page fault handler is (apart of SMP
locking overhead) the zeroing of the page that is also done in the page fault
handler. This zeroing means that all cachelines of the faulted page (on Altix
that means all 128 cachelines of 128 byte each) must be loaded and later
written back. This patch allows to avoid having to load all cachelines
if only a part of the cachelines of that page is needed immediately after
the fault. Doing so will only be effective for sparsely accessed memory
which is typical for anonymous memory and pte maps. Prezeroed pages will
only be used for those purposes. Unzeroed pages will be used as usual for
file mapping, page caching etc etc.

Others have also thought that prezeroing could be a benefit and have tried
provide a way to provide zeroed pages to the page fault handler:

http://marc.theaimsgroup.com/?t=109914559100004&r=1&w=2
http://marc.theaimsgroup.com/?t=109777267500005&r=1&w=2
http://marc.theaimsgroup.com/?l=linux-kernel&m=104931944213955&w=2

However, these attempt have tried to zero pages that are like to be used
soon (and that may have recently been accessed). Elements of these pages
are thus already in the cpu caches. Approaches like that will only shift
processing to somewhere else and not bring any performance benefits.
Prezeroing only makes sense for pages that are not currently needed and that
are not in the cpu caches. Pages that have recently been touched and that
soon will be touched again are better hot zeroed since the zeroing will
largely be done to cachelines already in the cpu caches.

The patch makes prezeroing very effective by:

1. Aggregating zeroing operations to only apply to pages of higher order,
which results in many pages that will later become zero 0 to be zeroed in one
step.
For that purpose the existing clear_page function is extended and made to
take an additional argument specifying the order of the page to be cleared.

2. Hardware support for offloading zeroing from the cpu. This avoids
the invalidation of the cpu caches by extensive zeroing operations.

The scrub daemon is invoked when a unzeroed page of a certain order has
been generated so that its worth running it. If no higher order pages are
present then the logic will favor hot zeroing rather than simply shifting
processing around. kscrubd typically runs only for a fraction of a second
and sleeps for long periods of time even under memory benchmarking. kscrubd
performs short bursts of zeroing when needed and tries to stay out off the
processor as much as possible.

The result is a significant increase of the page fault performance even for
single threaded applications (i386 2x PIII-450 384M RAM allocating 256M in
each run):

w/o patch:
Gb Rep Threads User System Wall flt/cpu/s fault/wsec
0 1 1 0.006s 0.389s 0.039s157455.320 157070.694
0 1 2 0.007s 0.607s 0.032s101476.689 190350.885

w/patch
Gb Rep Threads User System Wall flt/cpu/s fault/wsec
0 1 1 0.008s 0.083s 0.009s672151.422 664045.899
0 1 2 0.005s 0.129s 0.008s459629.796 741857.373

The performance can only be upheld if enough zeroed pages are available.
In a heavy memory intensive benchmark the system may run out of these very
fast but the efficient algorithm for page zeroing still makes this a winner
(2 way system with 384MB RAM, no hardware zeroing support). In the following
measurement the test is repeated 10 times allocating 256M each in rapid
succession which would deplete the pool of zeroed pages quickly):

w/o patch:
Gb Rep Threads User System Wall flt/cpu/s fault/wsec
0 10 1 0.058s 3.913s 3.097s157335.774 157076.932
0 10 2 0.063s 6.139s 3.027s100756.788 190572.486

w/patch
Gb Rep Threads User System Wall flt/cpu/s fault/wsec
0 10 1 0.059s 1.828s 1.089s330913.517 330225.515
0 10 2 0.082s 1.951s 1.094s307172.100 320680.232

Note that zeroing of pages makes no sense if the application
touches all cache lines of a page allocated (there is no influence of
prezeroing on benchmarks like lmbench for that reason) since the extensive
caching of modern cpus means that the zeroes written to a hot zeroed page
will then be overwritten by the application in the cpu cache and thus
the zeros will never make it to memory! The test program used above only
touches one 128 byte cache line of a 16k page (ia64). Sparsely
populated and accessed areas are typical for lots of applications.

Here is another test in order to gauge the influence of the number of cache
lines touched on the performance of the prezero enhancements:

Gb Rep Thr CLine User System Wall flt/cpu/s fault/wsec
1 1 1 1 0.01s 0.12s 0.01s500813.853 497925.891
1 1 1 2 0.01s 0.11s 0.01s493453.103 472877.725
1 1 1 4 0.02s 0.10s 0.01s479351.658 471507.415
1 1 1 8 0.01s 0.13s 0.01s424742.054 416725.013
1 1 1 16 0.05s 0.12s 0.01s347715.359 336983.834
1 1 1 32 0.12s 0.13s 0.02s258112.286 256246.731
1 1 1 64 0.24s 0.14s 0.03s169896.381 168189.283
1 1 1 128 0.49s 0.14s 0.06s102300.257 101674.435

The benefits of prezeroing are reduced to minimal quantities if all
cachelines of a page are touched. Prezeroing can only be effective
if the whole page is not immediately used after the page fault.

The patch is composed of 4 parts:

[1/4] Introduce __GFP_ZERO
Modifies the page allocator to be able to take the __GFP_ZERO flag
and returns zeroed memory on request. Modifies locations throughout
the linux sources that retrieve a page and then zero it to request
a zeroed page.

[2/4] Architecture specific clear_page updates
Adds second order argument to clear_page and updates all arches.

Note: The two first pages may be used alone if no zeroing engine is wanted.

[3/4] Page Zeroing
Adds management of ZEROED and NOT_ZEROED pages and a background daemon
called scrubd. scrubd is disabled by default but can be enabled
by writing an order number to /proc/sys/vm/scrub_start. If a page
is coalesced of that order or higher then the scrub daemon will
start zeroing until all pages of order /proc/sys/vm/scrub_stop and
higher are zeroed and then go back to sleep.

In an SMP environment the scrub daemon is typically
running on the most idle cpu. Thus a single threaded application running
on one cpu may have the other cpu zeroing pages for it etc. The scrub
daemon is hardly noticable and usually finished zeroing quickly since
most processors are optimized for linear memory filling.

[4/4] SGI Altix Block Transfer Engine Support
Implements a driver to shift the zeroing off the cpu into hardware.
With hardware support there will be minimal impact of zeroing
on the performance of the system.

2005-01-05 00:40:55

by Linus Torvalds

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory



On Tue, 4 Jan 2005, Christoph Lameter wrote:
>
> This patch introduces __GFP_ZERO as an additional gfp_mask element to allow
> to request zeroed pages from the page allocator.

Ok, let's start merging this slowly, and in particular, this 1/4 one looks
pretty much like a cleanup regardless of whatever else happen, so let's
just do it. However, for it to really be a cleanup, how about making
_this_ part:

> +
> + if (gfp_flags & __GFP_ZERO) {
> +#ifdef CONFIG_HIGHMEM
> + if (PageHighMem(page)) {
> + int n = 1 << order;
> +
> + while (n-- >0)
> + clear_highpage(page + n);
> + } else
> +#endif
> + clear_page(page_address(page), order);
> + }

Match the existing previous part:

> if (order && (gfp_flags & __GFP_COMP))
> prep_compound_page(page, order);


and just split it up into a "prep_zero_page(page, order)"? I dislike
#ifdef's in the middle of deep functions. In the middle of a _trivial_
function it's much more palatable.

At that point at least part 1 ends up being a nice clean patch on its own,
and should even shrink the code-size a bit. IOW, it not only is a cleanup,
there is even a technical argument for it (even without worrying about the
next stages).

Hmm?

Linus

2005-01-05 00:45:08

by Andrew Morton

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

Linus Torvalds <[email protected]> wrote:
>
> On Tue, 4 Jan 2005, Christoph Lameter wrote:
> >
> > This patch introduces __GFP_ZERO as an additional gfp_mask element to allow
> > to request zeroed pages from the page allocator.
>
> Ok, let's start merging this slowly

One week hence, please. Things like the no-bitmaps-for-the-buddy-allocator
have been well tested and should go in first.

2005-01-05 01:02:35

by Dave Hansen

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Tue, 2005-01-04 at 15:13 -0800, Christoph Lameter wrote:
> + if (gfp_flags & __GFP_ZERO) {
> +#ifdef CONFIG_HIGHMEM
> + if (PageHighMem(page)) {
> + int n = 1 << order;
> +
> + while (n-- >0)
> + clear_highpage(page + n);
> + } else
> +#endif
> + clear_page(page_address(page), order);
> + }
> if (order && (gfp_flags & __GFP_COMP))
> prep_compound_page(page, order);

That #ifdef can probably die. The compiler should get that all by
itself:

> #ifdef CONFIG_HIGHMEM
> #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags)
> #else
> #define PageHighMem(page) 0 /* needed to optimize away at compile time */
> #endif

-- Dave

2005-01-05 01:18:25

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Tue, 4 Jan 2005, Dave Hansen wrote:

> That #ifdef can probably die. The compiler should get that all by
> itself:
>
> > #ifdef CONFIG_HIGHMEM
> > #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags)
> > #else
> > #define PageHighMem(page) 0 /* needed to optimize away at compile time */
> > #endif

Ahh. Great. Do I need to submit a corrected patch that removes those two
lines or is it fine as is?

2005-01-05 01:18:42

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Tue, 4 Jan 2005, Andrew Morton wrote:

> > Ok, let's start merging this slowly
>
> One week hence, please. Things like the no-bitmaps-for-the-buddy-allocator
> have been well tested and should go in first.

The first two patches are basically cleanup type stuff and will not affect
the page allocator in a significant way. On the other hand they touch many
files and are thus difficult to maintain.

2005-01-05 01:27:42

by Linus Torvalds

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory



On Tue, 4 Jan 2005, Christoph Lameter wrote:
>
> Ahh. Great. Do I need to submit a corrected patch that removes those two
> lines or is it fine as is?

Please do split it up into a function of its own. It's going to look a lot
prettier as an intermediate phase. I realize that that touches #3 in the
series, but I suspect that one will also just be prettier as a result.

Linus

2005-01-05 02:16:52

by Andi Kleen

[permalink] [raw]
Subject: Re: Prezeroing V3 [4/4]: Driver for hardware zeroing on Altix

Christoph Lameter <[email protected]> writes:

> + /* Check limitations.
> + 1. System must be running (weird things happen during bootup)
> + 2. Size >64KB. Smaller requests cause too much bte traffic
> + */
> + if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING)
> + return EINVAL;

surely return -EINVAL;

Also have you thought about doing a similar driver for x86/x86-64 using
cache bypassing stores?

-Andi

2005-01-05 16:28:54

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [4/4]: Driver for hardware zeroing on Altix

On Wed, 5 Jan 2005, Andi Kleen wrote:

> Christoph Lameter <[email protected]> writes:
>
> > + /* Check limitations.
> > + 1. System must be running (weird things happen during bootup)
> > + 2. Size >64KB. Smaller requests cause too much bte traffic
> > + */
> > + if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING)
> > + return EINVAL;
>
> surely return -EINVAL;

Anything will do as long as its != 0. But yeah that would more closely
follow convention.

> Also have you thought about doing a similar driver for x86/x86-64 using
> cache bypassing stores?

As you know we do ia64 and I am no expert on x86_64. But the interface for
hardware zeroing is designed for purposes like that.

2005-01-05 23:14:32

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Tue, 4 Jan 2005, Linus Torvalds wrote:

> Please do split it up into a function of its own. It's going to look a lot
> prettier as an intermediate phase. I realize that that touches #3 in the
> series, but I suspect that one will also just be prettier as a result.

Here is the first patch redone as you wanted. I also removed all
dependencies on the second patch. This should be able to get in
on its own.
I will sent the revised second patch dealing with updating clear_page
later and keep back the last two patches until the bitmap thing has been
changed in the buddy allocator.

Signed-off-by: Christoph Lameter <[email protected]>

This patch introduces __GFP_ZERO as an additional gfp_mask element to allow
to request zeroed pages from the page allocator.

- Modifies the page allocator so that it zeroes memory if __GFP_ZERO is set

- Replace all page zeroing after allocating pages by prior allocations with
allocations using __GFP_ZERO

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-05 09:32:52.000000000 -0800
@@ -549,6 +549,12 @@
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
+static inline void prep_zero_page(struct page *page, int order) {
+ int i;
+
+ for(i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
+}

static struct page *
buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
@@ -584,6 +590,10 @@
BUG_ON(bad_range(zone, page));
mod_page_state_zone(zone, pgalloc, 1 << order);
prep_new_page(page, order);
+
+ if (gfp_flags & __GFP_ZERO)
+ prep_zero_page(page, order);
+
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
}
@@ -796,12 +806,9 @@
*/
BUG_ON(gfp_mask & __GFP_HIGHMEM);

- page = alloc_pages(gfp_mask, 0);
- if (page) {
- void *address = page_address(page);
- clear_page(address);
- return (unsigned long) address;
- }
+ page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
+ if (page)
+ return (unsigned long) page_address(page);
return 0;
}

Index: linux-2.6.10/include/linux/gfp.h
===================================================================
--- linux-2.6.10.orig/include/linux/gfp.h 2004-12-24 13:34:27.000000000 -0800
+++ linux-2.6.10/include/linux/gfp.h 2005-01-05 09:30:39.000000000 -0800
@@ -37,6 +37,7 @@
#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
#define __GFP_COMP 0x4000 /* Add compound page metadata */
+#define __GFP_ZERO 0x8000 /* Return zeroed page on success */

#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
@@ -52,6 +53,7 @@
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHZERO (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_ZERO)

/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/mm/memory.c 2005-01-05 09:30:39.000000000 -0800
@@ -1650,10 +1650,9 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
if (!page)
goto no_mem;
- clear_user_highpage(page, addr);

spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, addr);
Index: linux-2.6.10/kernel/profile.c
===================================================================
--- linux-2.6.10.orig/kernel/profile.c 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/kernel/profile.c 2005-01-05 09:30:39.000000000 -0800
@@ -326,17 +326,15 @@
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
return NOTIFY_BAD;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_free;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
}
break;
@@ -510,16 +508,14 @@
int node = cpu_to_node(cpu);
struct page *page;

- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node, GFP_KERNEL, 0);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
if (!page)
goto out_cleanup;
- clear_highpage(page);
per_cpu(cpu_profile_hits, cpu)[0]
= (struct profile_hit *)page_address(page);
}
Index: linux-2.6.10/mm/shmem.c
===================================================================
--- linux-2.6.10.orig/mm/shmem.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/mm/shmem.c 2005-01-05 09:30:39.000000000 -0800
@@ -369,9 +369,8 @@
}

spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
if (page) {
- clear_highpage(page);
page->nr_swapped = 0;
}
spin_lock(&info->lock);
@@ -910,7 +909,7 @@
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
@@ -926,7 +925,7 @@
shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info,
unsigned long idx)
{
- return alloc_page(gfp);
+ return alloc_page(gfp | __GFP_ZERO);
}
#endif

@@ -1135,7 +1134,6 @@

info->alloced++;
spin_unlock(&info->lock);
- clear_highpage(filepage);
flush_dcache_page(filepage);
SetPageUptodate(filepage);
}
Index: linux-2.6.10/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/pgalloc.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -61,9 +61,7 @@
pgd_t *pgd = pgd_alloc_one_fast(mm);

if (unlikely(pgd == NULL)) {
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
- if (likely(pgd != NULL))
- clear_page(pgd);
+ pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
}
return pgd;
}
@@ -106,10 +104,8 @@
static inline pmd_t*
pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);

- if (likely(pmd != NULL))
- clear_page(pmd);
return pmd;
}

@@ -140,20 +136,16 @@
static inline struct page *
pte_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

- if (likely(pte != NULL))
- clear_page(page_address(pte));
return pte;
}

static inline pte_t *
pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);

- if (likely(pte != NULL))
- clear_page(pte);
return pte;
}

Index: linux-2.6.10/arch/i386/mm/pgtable.c
===================================================================
--- linux-2.6.10.orig/arch/i386/mm/pgtable.c 2005-01-04 14:16:59.000000000 -0800
+++ linux-2.6.10/arch/i386/mm/pgtable.c 2005-01-05 09:30:39.000000000 -0800
@@ -140,10 +140,7 @@

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
- return pte;
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -151,12 +148,10 @@
struct page *pte;

#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
- if (pte)
- clear_highpage(pte);
return pte;
}

Index: linux-2.6.10/include/asm-mips/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/pgalloc.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-mips/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -56,9 +56,7 @@
{
pte_t *pte;

- pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, PTE_ORDER);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *) __get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, PTE_ORDER);

return pte;
}
Index: linux-2.6.10/arch/alpha/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/mm/init.c 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/arch/alpha/mm/init.c 2005-01-05 09:30:39.000000000 -0800
@@ -42,10 +42,9 @@
{
pgd_t *ret, *init;

- ret = (pgd_t *)__get_free_page(GFP_KERNEL);
+ ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
init = pgd_offset(&init_mm, 0UL);
if (ret) {
- clear_page(ret);
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
memcpy (ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD - 1)*sizeof(pgd_t));
@@ -63,9 +62,7 @@
pte_t *
pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

Index: linux-2.6.10/include/asm-parisc/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/pgalloc.h 2004-12-24 13:35:39.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -120,18 +120,14 @@
static inline struct page *
pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
- if (likely(page != NULL))
- clear_page(page_address(page));
+ struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return page;
}

static inline pte_t *
pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (likely(pte != NULL))
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

Index: linux-2.6.10/include/asm-sparc64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/pgalloc.h 2004-12-24 13:35:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -73,10 +73,9 @@
struct page *page;

preempt_enable();
- page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (page) {
ret = (struct page *)page_address(page);
- clear_page(ret);
page->lru.prev = (void *) 2UL;

preempt_disable();
Index: linux-2.6.10/include/asm-sh/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/pgalloc.h 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/include/asm-sh/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -44,9 +44,7 @@
{
pte_t *pte;

- pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);

return pte;
}
@@ -56,9 +54,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

return pte;
}
Index: linux-2.6.10/include/asm-m32r/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/pgalloc.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -23,10 +23,7 @@
*/
static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
-
- if (pgd)
- clear_page(pgd);
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);

return pgd;
}
@@ -39,10 +36,7 @@
static __inline__ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL);
-
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);

return pte;
}
@@ -50,10 +44,8 @@
static __inline__ struct page *pte_alloc_one(struct mm_struct *mm,
unsigned long address)
{
- struct page *pte = alloc_page(GFP_KERNEL);
+ struct page *pte = alloc_page(GFP_KERNEL|__GFP_ZERO);

- if (pte)
- clear_page(page_address(pte));

return pte;
}
Index: linux-2.6.10/arch/um/kernel/mem.c
===================================================================
--- linux-2.6.10.orig/arch/um/kernel/mem.c 2005-01-04 14:17:00.000000000 -0800
+++ linux-2.6.10/arch/um/kernel/mem.c 2005-01-05 09:30:39.000000000 -0800
@@ -327,9 +327,7 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

@@ -337,9 +335,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_highpage(pte);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
return pte;
}

Index: linux-2.6.10/include/asm-sh64/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/pgalloc.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -112,9 +112,7 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT|__GFP_ZERO);

return pte;
}
@@ -123,9 +121,7 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);

return pte;
}
@@ -150,9 +146,7 @@
static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
pmd_t *pmd;
- pmd = (pmd_t *) __get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pmd)
- clear_page(pmd);
+ pmd = (pmd_t *) __get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pmd;
}

Index: linux-2.6.10/include/asm-cris/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/pgalloc.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-cris/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -24,18 +24,14 @@

extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (pte)
- clear_page(pte);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return pte;
}

extern inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
- if (pte)
- clear_page(page_address(pte));
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
return pte;
}

Index: linux-2.6.10/arch/ppc/mm/pgtable.c
===================================================================
--- linux-2.6.10.orig/arch/ppc/mm/pgtable.c 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/arch/ppc/mm/pgtable.c 2005-01-05 09:30:39.000000000 -0800
@@ -85,8 +85,7 @@
{
pgd_t *ret;

- if ((ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER)) != NULL)
- clear_pages(ret, PGDIR_ORDER);
+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
return ret;
}

@@ -102,7 +101,7 @@
extern void *early_get_page(void);

if (mem_init_done) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
struct page *ptepage = virt_to_page(pte);
ptepage->mapping = (void *) mm;
@@ -110,8 +109,6 @@
}
} else
pte = (pte_t *)early_get_page();
- if (pte)
- clear_page(pte);
return pte;
}

Index: linux-2.6.10/include/asm-alpha/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/pgalloc.h 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -40,9 +40,7 @@
static inline pmd_t *
pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
- pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (ret)
- clear_page(ret);
+ pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
return ret;
}

Index: linux-2.6.10/include/asm-m68k/motorola_pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/motorola_pgalloc.h 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/motorola_pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -12,9 +12,8 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
- clear_page(pte);
__flush_page_to_ram(pte);
flush_tlb_kernel_page(pte);
nocache_page(pte);
@@ -31,7 +30,7 @@

static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
pte_t *pte;

if(!page)
@@ -39,7 +38,6 @@

pte = kmap(page);
if (pte) {
- clear_page(pte);
__flush_page_to_ram(pte);
flush_tlb_kernel_page(pte);
nocache_page(pte);
Index: linux-2.6.10/arch/sparc64/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sparc64/mm/init.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/sparc64/mm/init.c 2005-01-05 09:30:39.000000000 -0800
@@ -1687,13 +1687,12 @@
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
- mem_map_zero = alloc_pages(GFP_KERNEL, 0);
+ mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
}
SetPageReserved(mem_map_zero);
- clear_page(page_address(mem_map_zero));

codepages = (((unsigned long) _etext) - ((unsigned long) _start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
Index: linux-2.6.10/include/asm-arm/pgalloc.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/pgalloc.h 2004-12-24 13:35:29.000000000 -0800
+++ linux-2.6.10/include/asm-arm/pgalloc.h 2005-01-05 09:30:39.000000000 -0800
@@ -50,9 +50,8 @@
{
pte_t *pte;

- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
- clear_page(pte);
clean_dcache_area(pte, sizeof(pte_t) * PTRS_PER_PTE);
pte += PTRS_PER_PTE;
}
@@ -65,10 +64,9 @@
{
struct page *pte;

- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
if (pte) {
void *page = page_address(pte);
- clear_page(page);
clean_dcache_area(page, sizeof(pte_t) * PTRS_PER_PTE);
}

Index: linux-2.6.10/drivers/block/pktcdvd.c
===================================================================
--- linux-2.6.10.orig/drivers/block/pktcdvd.c 2004-12-24 13:33:49.000000000 -0800
+++ linux-2.6.10/drivers/block/pktcdvd.c 2005-01-05 09:30:39.000000000 -0800
@@ -135,12 +135,10 @@
goto no_bio;

for (i = 0; i < PAGES_PER_PACKET; i++) {
- pkt->pages[i] = alloc_page(GFP_KERNEL);
+ pkt->pages[i] = alloc_page(GFP_KERNEL|| __GFP_ZERO);
if (!pkt->pages[i])
goto no_page;
}
- for (i = 0; i < PAGES_PER_PACKET; i++)
- clear_page(page_address(pkt->pages[i]));

spin_lock_init(&pkt->lock);

2005-01-05 23:34:10

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [2/4]: Extension of clear_page to take an order parameter

Here is an updated version that is independent of the first patch and
contains all the necessary modifications to make clear_page take a second
parameter.

Architecture support:
---------------------

Known to work:

ia64
i386
sparc64
m68k

Trivial modification expected to simply work:

arm
cris
h8300
m68knommu
ppc
ppc64
sh64
v850
parisc
sparc
um

Modification made but it would be good to have some feedback from the arch maintainers:

x86_64
s390
alpha
sh
mips
m32r

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -56,7 +56,7 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_page (void *page, int order);
extern void copy_page (void *to, void *from);

/*
@@ -65,7 +65,7 @@
*/
#define clear_user_page(addr, vaddr, page) \
do { \
- clear_page(addr); \
+ clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)

Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_page(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,12 +28,12 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-04 14:17:01.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -32,10 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_page(void *, int);
void copy_page(void *, void *);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-sparc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -28,10 +28,10 @@

#ifndef __ASSEMBLY__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
sparc_flush_page_to_ram(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -22,12 +22,12 @@

#ifndef __s390x__

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
register_pair rp;

rp.subreg.even = (unsigned long) page;
- rp.subreg.odd = (unsigned long) 4096;
+ rp.subreg.odd = (unsigned long) 4096 << order;
asm volatile (" slr 1,1\n"
" mvcl %0,0"
: "+&a" (rp) : : "memory", "cc", "1" );
@@ -63,14 +63,19 @@

#else /* __s390x__ */

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
- asm volatile (" lgr 2,%0\n"
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ asm volatile (" lgr 2,%0\n"
" lghi 3,4096\n"
" slgr 1,1\n"
" mvcl 2,0"
: : "a" ((void *) (page))
: "memory", "cc", "1", "2", "3" );
+ page += PAGE_SIZE;
+ }
}

static inline void copy_page(void *to, void *from)
@@ -103,7 +108,7 @@

#endif /* __s390x__ */

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/* Pure 2^n version of get_order */
Index: linux-2.6.10/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800
+++ linux-2.6.10/arch/i386/lib/mmx.c 2005-01-05 10:09:51.000000000 -0800
@@ -128,7 +128,7 @@
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.10/include/asm-x86_64/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/mmx.h 2005-01-05 10:09:51.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800
+++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-01-05 10:09:51.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

GLOBAL_ENTRY(clear_page)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
Index: linux-2.6.10/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-01-05 10:09:51.000000000 -0800
@@ -7,6 +7,7 @@
clear_page:
xorl %eax,%eax
movl $4096/64,%ecx
+ shl %esi, %ecx
.p2align 4
.Lloop:
decl %ecx
@@ -42,6 +43,7 @@
.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
+ shl %esi, %ecx
xorl %eax,%eax
rep
stosq
Index: linux-2.6.10/include/asm-sh/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/page.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-sh/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -36,12 +36,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void (*clear_page)(void *to);
+extern void (*_clear_page)(void *to);
extern void (*copy_page)(void *to, void *from);

extern void clear_page_slow(void *to);
extern void copy_page_slow(void *to, void *from);

+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
#if defined(CONFIG_SH7705_CACHE_32KB) && defined(CONFIG_MMU)
struct page;
extern void clear_user_page(void *to, unsigned long address, struct page *pg);
@@ -49,7 +59,7 @@
extern void __clear_user_page(void *to, void *orig_to);
extern void __copy_user_page(void *to, void *from, void *orig_to);
#elif defined(CONFIG_CPU_SH2) || defined(CONFIG_CPU_SH3) || !defined(CONFIG_MMU)
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#elif defined(CONFIG_CPU_SH4)
struct page;
Index: linux-2.6.10/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-i386/mmx.h 2005-01-05 10:09:51.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/alpha/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/clear_page.S 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/clear_page.S 2005-01-05 10:09:51.000000000 -0800
@@ -6,11 +6,10 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0
-
lda $0,128
nop
unop
@@ -36,4 +35,4 @@
unop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/include/asm-sh64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -50,12 +50,20 @@
extern void sh64_page_clear(void *page);
extern void sh64_page_copy(void *from, void *to);

-#define clear_page(page) sh64_page_clear(page)
+static inline void clear_page(page, order)
+{
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ sh64_page_clear(page++, 0);
+ }
+}
+
#define copy_page(to,from) sh64_page_copy(from, to)

#if defined(CONFIG_DCACHE_DISABLED)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) sh_clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#else
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-arm/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-arm/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -128,7 +128,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
extern void copy_page(void *to, const void *from);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-ppc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc64/page.h 2004-12-24 13:33:49.000000000 -0800
+++ linux-2.6.10/include/asm-ppc64/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -102,12 +102,12 @@
#define REGION_MASK (((1UL<<REGION_SIZE)-1UL)<<REGION_SHIFT)
#define REGION_STRIDE (1UL << REGION_SHIFT)

-static __inline__ void clear_page(void *addr)
+static __inline__ void clear_page(void *addr, int order)
{
unsigned long lines, line_size;

line_size = systemcfg->dCacheL1LineSize;
- lines = naca->dCacheL1LinesPerPage;
+ lines = naca->dCacheL1LinesPerPage << order;

__asm__ __volatile__(
"mtctr %1 # clear_page\n\
Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -11,10 +11,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void *to);
+extern void _clear_page(void *to);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- > 0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+
extern void copy_page(void *to, void *from);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -15,8 +15,20 @@

#define STRICT_MM_TYPECHECKS

-extern void clear_page(void *page);
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+extern void _clear_page(void *page);
+
+static inline void clear_page(void *page, int order)
+{
+ int nr = 1 << order;
+
+ while (nr--)
+ {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)

extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.10/arch/mips/mm/pg-sb1.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-sb1.c 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-sb1.c 2005-01-05 10:09:51.000000000 -0800
@@ -42,7 +42,7 @@
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void clear_page_cpu(void *page)
#else
-void clear_page(void *page)
+void _clear_page(void *page)
#endif
{
unsigned char *addr = (unsigned char *) page;
@@ -172,14 +172,13 @@
IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_BASE)));
}

-void clear_page(void *page)
+void _clear_page(void *page)
{
int cpu = smp_processor_id();

/* if the page is above Kseg0, use old way */
if (KSEGX(page) != CAC_BASE)
return clear_page_cpu(page);
-
page_descr[cpu].dscr_a = PHYSADDR(page) | M_DM_DSCRA_ZERO_MEM | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
page_descr[cpu].dscr_b = V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
__raw_writeq(1, IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_COUNT)));
@@ -218,5 +217,5 @@

#endif

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);
EXPORT_SYMBOL(copy_page);
Index: linux-2.6.10/include/asm-m68k/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/page.h 2004-12-24 13:35:49.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -50,7 +50,7 @@
);
}

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
unsigned long tmp;
unsigned long *sp = page;
@@ -69,16 +69,16 @@
"dbra %1,1b\n\t"
: "=a" (sp), "=d" (tmp)
: "a" (page), "0" (sp),
- "1" ((PAGE_SIZE - 16) / 16 - 1));
+ "1" (((PAGE_SIZE<<(order)) - 16) / 16 - 1));
}

#else
-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)
#endif

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-mips/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/page.h 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/include/asm-mips/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -39,7 +39,18 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void * page);
+extern void _clear_page(void * page);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
extern void copy_page(void * to, void * from);

extern unsigned long shm_align_mask;
@@ -57,7 +68,7 @@
{
extern void (*flush_data_cache_page)(unsigned long addr);

- clear_page(addr);
+ clear_page(addr, 0);
if (pages_do_alias((unsigned long) addr, vaddr))
flush_data_cache_page((unsigned long)addr);
}
Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -15,10 +15,10 @@

#ifdef __KERNEL__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-v850/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-v850/page.h 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/include/asm-v850/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -37,11 +37,11 @@

#define STRICT_MM_TYPECHECKS

-#define clear_page(page) memset ((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset ((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to, from) memcpy ((void *)(to), (void *)from, PAGE_SIZE)

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-parisc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/page.h 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -13,7 +13,7 @@
#include <asm/types.h>
#include <asm/cache.h>

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) copy_user_page_asm((void *)(to), (void *)(from))

struct page;
Index: linux-2.6.10/arch/arm/mm/copypage-v6.c
===================================================================
--- linux-2.6.10.orig/arch/arm/mm/copypage-v6.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/arm/mm/copypage-v6.c 2005-01-05 10:09:51.000000000 -0800
@@ -47,7 +47,7 @@
*/
void v6_clear_user_page_nonaliasing(void *kaddr, unsigned long vaddr)
{
- clear_page(kaddr);
+ _clear_page(kaddr);
}

/*
@@ -116,7 +116,7 @@

set_pte(to_pte + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
flush_tlb_kernel_page(to);
- clear_page((void *)to);
+ _clear_page((void *)to);

spin_unlock(&v6_lock);
}
Index: linux-2.6.10/arch/m32r/mm/page.S
===================================================================
--- linux-2.6.10.orig/arch/m32r/mm/page.S 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/arch/m32r/mm/page.S 2005-01-05 10:09:51.000000000 -0800
@@ -51,7 +51,7 @@
jmp r14

.text
- .global clear_page
+ .global _clear_page
/*
* clear_page (to)
*
@@ -60,7 +60,7 @@
* 16 * 256
*/
.align 4
-clear_page:
+_clear_page:
ldi r2, #255
ldi r4, #0
ld r3, @r0 /* cache line allocate */
Index: linux-2.6.10/include/asm-ppc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-ppc/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -85,7 +85,7 @@

struct page;
extern void clear_pages(void *page, int order);
-static inline void clear_page(void *page) { clear_pages(page, 0); }
+#define clear_page clear_pages
extern void copy_page(void *to, void *from);
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
Index: linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/kernel/alpha_ksyms.c 2004-12-24 13:33:51.000000000 -0800
+++ linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c 2005-01-05 10:09:51.000000000 -0800
@@ -88,7 +88,7 @@
EXPORT_SYMBOL(__memsetw);
EXPORT_SYMBOL(__constant_c_memset);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(__direct_map_base);
EXPORT_SYMBOL(__direct_map_size);
Index: linux-2.6.10/arch/alpha/lib/ev6-clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/ev6-clear_page.S 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/ev6-clear_page.S 2005-01-05 10:09:51.000000000 -0800
@@ -6,9 +6,9 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0

lda $0,128
@@ -51,4 +51,4 @@
nop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/arch/sh/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/init.c 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/init.c 2005-01-05 10:09:51.000000000 -0800
@@ -57,7 +57,7 @@
#endif

void (*copy_page)(void *from, void *to);
-void (*clear_page)(void *to);
+void (*_clear_page)(void *to);

void show_mem(void)
{
@@ -255,7 +255,7 @@
* later in the boot process if a better method is available.
*/
copy_page = copy_page_slow;
- clear_page = clear_page_slow;
+ _clear_page = clear_page_slow;

/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem_node(NODE_DATA(0));
Index: linux-2.6.10/arch/sh/mm/pg-dma.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-dma.c 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-dma.c 2005-01-05 10:09:51.000000000 -0800
@@ -78,7 +78,7 @@
return ret;

copy_page = copy_page_dma;
- clear_page = clear_page_dma;
+ _clear_page = clear_page_dma;

return ret;
}
Index: linux-2.6.10/arch/sh/mm/pg-nommu.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-nommu.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-nommu.c 2005-01-05 10:09:51.000000000 -0800
@@ -27,7 +27,7 @@
static int __init pg_nommu_init(void)
{
copy_page = copy_page_nommu;
- clear_page = clear_page_nommu;
+ _clear_page = clear_page_nommu;

return 0;
}
Index: linux-2.6.10/arch/mips/mm/pg-r4k.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-r4k.c 2004-12-24 13:34:49.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-r4k.c 2005-01-05 10:09:51.000000000 -0800
@@ -39,9 +39,9 @@

static unsigned int clear_page_array[0x130 / 4];

-void clear_page(void * page) __attribute__((alias("clear_page_array")));
+void _clear_page(void * page) __attribute__((alias("clear_page_array")));

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

/*
* Maximum sizes:
Index: linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/m32r/kernel/m32r_ksyms.c 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c 2005-01-05 10:09:51.000000000 -0800
@@ -102,7 +102,7 @@
EXPORT_SYMBOL(memcmp);
EXPORT_SYMBOL(memscan);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(strcat);
EXPORT_SYMBOL(strchr);
Index: linux-2.6.10/include/asm-arm26/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm26/page.h 2004-12-24 13:35:22.000000000 -0800
+++ linux-2.6.10/include/asm-arm26/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -25,7 +25,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
#define copy_page(to, from) __copy_user_page(to, from, 0);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/page.h 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/page.h 2005-01-05 10:09:51.000000000 -0800
@@ -14,8 +14,8 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, unsigned long order);
+#define clear_page(X,Y) _clear_page((void *)(X),(Y))
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.10/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800
+++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-01-05 10:09:51.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
Index: linux-2.6.10/drivers/net/tc35815.c
===================================================================
--- linux-2.6.10.orig/drivers/net/tc35815.c 2005-01-05 09:43:48.000000000 -0800
+++ linux-2.6.10/drivers/net/tc35815.c 2005-01-05 10:09:51.000000000 -0800
@@ -657,7 +657,7 @@
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
} else {
- clear_page(lp->fd_buf);
+ clear_page(lp->fd_buf, 0);
#ifdef __mips__
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-05 09:32:52.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-05 10:09:51.000000000 -0800
@@ -550,10 +550,14 @@
* or two.
*/
static inline void prep_zero_page(struct page *page, int order) {
- int i;

- for(i = 0; i < 1 << order; i++)
- clear_highpage(page + i);
+ if (PageHighMem(page)) {
+ int i;
+
+ for(i = 0; i < 1 << order; i++)
+ clear_highpage(page + i);
+ } else
+ clear_page(page_address(page), order);
}

static struct page *
Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-05 10:09:44.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-05 10:10:08.000000000 -0800
@@ -45,7 +45,7 @@
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
- clear_page(kaddr);
+ clear_page(kaddr, 0);
kunmap_atomic(kaddr, KM_USER0);
}


2005-01-06 13:53:11

by Andi Kleen

[permalink] [raw]
Subject: Re: Prezeroing V3 [2/4]: Extension of clear_page to take an order parameter

Christoph Lameter <[email protected]> writes:

> Here is an updated version that is independent of the first patch and
> contains all the necessary modifications to make clear_page take a second
> parameter.

I still think the clear_page order addition is completely pointless,
because for > order 0 you probably want a cache bypassing store
in a separate function.

Removing it would also make the patch much less intrusive.

-Andi

2005-01-06 17:55:01

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [2/4]: Extension of clear_page to take an order parameter

On Thu, 6 Jan 2005, Andi Kleen wrote:

> Christoph Lameter <[email protected]> writes:
>
> > Here is an updated version that is independent of the first patch and
> > contains all the necessary modifications to make clear_page take a second
> > parameter.
>
> I still think the clear_page order addition is completely pointless,
> because for > order 0 you probably want a cache bypassing store
> in a separate function.

I would think that having clear_page avoid loading cache
lines from memory should be general improvement.

Bypassing the cache may be beneficial for clear_page in general but I
would like to test that first.

If this is not a win then it may be better to implement the bypassing the
cache through a zero driver.

> Removing it would also make the patch much less intrusive.

Right. I also thought about that. I will likely offer the clear_page patch
as an optional component in V4. Being able to specify an order with
clear_page also helps in other situations like clearing huge pages.

2005-01-08 21:12:57

by Hugh Dickins

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Tue, 4 Jan 2005, Christoph Lameter wrote:
> This patch introduces __GFP_ZERO as an additional gfp_mask element to allow
> to request zeroed pages from the page allocator.
> ...
> --- linux-2.6.10.orig/mm/memory.c 2005-01-04 12:16:41.000000000 -0800
> +++ linux-2.6.10/mm/memory.c 2005-01-04 12:16:49.000000000 -0800
> @@ -1650,10 +1650,9 @@
>
> if (unlikely(anon_vma_prepare(vma)))
> goto no_mem;
> - page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
> + page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
> if (!page)
> goto no_mem;
> - clear_user_highpage(page, addr);
>
> spin_lock(&mm->page_table_lock);
> page_table = pte_offset_map(pmd, addr);

Christoph, a late comment: doesn't this effectively replace
do_anonymous_page's clear_user_highpage by clear_highpage, which would
be a bad idea (inefficient? or corrupting?) on those few architectures
which actually do something with that user addr?

Hugh

2005-01-08 22:09:28

by David Miller

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Sat, 8 Jan 2005 21:12:10 +0000 (GMT)
Hugh Dickins <[email protected]> wrote:

> Christoph, a late comment: doesn't this effectively replace
> do_anonymous_page's clear_user_highpage by clear_highpage, which would
> be a bad idea (inefficient? or corrupting?) on those few architectures
> which actually do something with that user addr?

Good catch, it probably does. We really do need to use
the page clearing routines that pass in the user virtual
address when preparing new anonymous pages or else we'll
get cache aliasing problems on sparc, sparc64, and mips
at the very least. That is what the virtual address argument
was added for to begin with.

The other way to deal with this is to make whatever routine
the kscrubd thing invokes do all the cache flushing et al.
magic so that the above works when taking pages from the
pre-zero'd pool (only, if no pre-zero'd pages are available
we sill need to invoke clear_user_highpage() with the proper
virtual address).

2005-01-10 17:17:02

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Sat, 8 Jan 2005, Hugh Dickins wrote:

> Christoph, a late comment: doesn't this effectively replace
> do_anonymous_page's clear_user_highpage by clear_highpage, which would
> be a bad idea (inefficient? or corrupting?) on those few architectures
> which actually do something with that user addr?

Yes. Right my ia64 centric vision got me again. Thanks for all the other
patches that were posted. I hope this is now all cleared up?

2005-01-10 18:16:00

by Linus Torvalds

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory



On Mon, 10 Jan 2005, Christoph Lameter wrote:
>
> Yes. Right my ia64 centric vision got me again. Thanks for all the other
> patches that were posted. I hope this is now all cleared up?

Hmm.. I fixed things up, but I didn't exactly do it like the posted
patches.

Currently the BK tree
- doesn't use __GFP_ZERO with anonymous user-mapped pages (which is what
you wrote this whole thing for ;)

Potential fix: declare a per-architecture "alloc_user_highpage(vaddr)"
that does the proper magic on virtually indexed machines, and on others
it just does a "alloc_page(GFP_HIGHUSER | __GFP_ZERO)".

- verifies that nobody ever asks for a HIGHMEM allocation together with
__GFP_ZERO (nobody does - a quick grep shows that 99% of all uses are
statically clearly fine (there's a few HIGHMEM zero-page users, but
they are all GFP_KERNEL or similar), with just two special cases:

- get_zeroed_page() - which can't use HIGHMEM anyway
- shm.c does "mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO"
and that's fine because while the mapping gfp masks may lack
GFP_FS and GFP_IO, they are always supposed to be ok with
waiting.

- moves "kernel_map_pages()" into "prep_new_page()" to fix the
DEBUG_PAGEALLOC issue (Chris Wright).

So that should take care of the known problems.

Linus

2005-01-10 20:24:17

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V3 [1/4]: Allow request for zeroed memory

On Mon, 10 Jan 2005, Linus Torvalds wrote:

> Currently the BK tree
> - doesn't use __GFP_ZERO with anonymous user-mapped pages (which is what
> you wrote this whole thing for ;)
>
> Potential fix: declare a per-architecture "alloc_user_highpage(vaddr)"
> that does the proper magic on virtually indexed machines, and on others
> it just does a "alloc_page(GFP_HIGHUSER | __GFP_ZERO)".

The following patch adds an alloc_zeroed_user_highpage(vma, vaddr). It
also uses zeroed pages on COW. clear_user_highpage is now only used by
that function. Fold it into alloc_zeroed_user_highpage?

This is against last hours bitkeeper tree. mm/memory.o compiles fine but
I was not able to build a ia64 kernel due to some pieces that seem to be
missing in last hours tree.

Index: linus/include/asm-ia64/page.h
===================================================================
--- linus.orig/include/asm-ia64/page.h 2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-ia64/page.h 2005-01-10 12:05:55.000000000 -0800
@@ -75,6 +75,16 @@
flush_dcache_page(page); \
} while (0)

+
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+({ \
+ struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+ flush_dcache_page(page); \
+ page; \
+})
+
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

#ifdef CONFIG_VIRTUAL_MEM_MAP
Index: linus/include/asm-h8300/page.h
===================================================================
--- linus.orig/include/asm-h8300/page.h 2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-h8300/page.h 2005-01-10 11:53:17.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linus/mm/memory.c
===================================================================
--- linus.orig/mm/memory.c 2005-01-10 11:44:39.000000000 -0800
+++ linus/mm/memory.c 2005-01-10 12:05:21.000000000 -0800
@@ -84,20 +84,6 @@
EXPORT_SYMBOL(vmalloc_earlyreserve);

/*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
-{
- if (from == ZERO_PAGE(address)) {
- clear_user_highpage(to, address);
- return;
- }
- copy_user_highpage(to, from, address);
-}
-
-/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
@@ -1329,11 +1315,16 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_new_page;
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!new_page)
- goto no_new_page;
- copy_cow_page(old_page,new_page,address);
-
+ if (old_page == ZERO_PAGE(address)) {
+ new_page = alloc_zeroed_user_highpage(vma, address);
+ if (!new_page)
+ goto no_new_page;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ if (!new_page)
+ goto no_new_page;
+ copy_user_highpage(new_page, old_page, address);
+ }
/*
* Re-check the pte - we dropped the lock
*/
@@ -1795,10 +1786,9 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ page = alloc_zeroed_user_highpage(vma, addr);
if (!page)
goto no_mem;
- clear_user_highpage(page, addr);

spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, addr);
Index: linus/include/asm-m32r/page.h
===================================================================
--- linus.orig/include/asm-m32r/page.h 2004-10-20 12:04:58.000000000 -0700
+++ linus/include/asm-m32r/page.h 2005-01-10 12:08:03.000000000 -0800
@@ -17,6 +17,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linus/include/asm-alpha/page.h
===================================================================
--- linus.orig/include/asm-alpha/page.h 2004-10-20 12:04:57.000000000 -0700
+++ linus/include/asm-alpha/page.h 2005-01-10 11:54:37.000000000 -0800
@@ -18,6 +18,9 @@
extern void clear_page(void *page);
#define clear_user_page(page, vaddr, pg) clear_page(page)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

Index: linus/include/asm-m68knommu/page.h
===================================================================
--- linus.orig/include/asm-m68knommu/page.h 2005-01-10 09:53:05.000000000 -0800
+++ linus/include/asm-m68knommu/page.h 2005-01-10 11:54:27.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linus/include/asm-cris/page.h
===================================================================
--- linus.orig/include/asm-cris/page.h 2004-10-20 12:04:57.000000000 -0700
+++ linus/include/asm-cris/page.h 2005-01-10 11:55:06.000000000 -0800
@@ -21,6 +21,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linus/include/linux/highmem.h
===================================================================
--- linus.orig/include/linux/highmem.h 2005-01-06 12:58:48.000000000 -0800
+++ linus/include/linux/highmem.h 2005-01-10 12:08:56.000000000 -0800
@@ -42,6 +42,17 @@
smp_wmb();
}

+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+static inline struct page* alloc_zeroed_user_highpage(struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+
+ clear_user_highpage(page, vaddr);
+ return page;
+}
+#endif
+
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
Index: linus/include/asm-i386/page.h
===================================================================
--- linus.orig/include/asm-i386/page.h 2005-01-06 12:58:47.000000000 -0800
+++ linus/include/asm-i386/page.h 2005-01-10 12:09:43.000000000 -0800
@@ -36,6 +36,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linus/include/asm-x86_64/page.h
===================================================================
--- linus.orig/include/asm-x86_64/page.h 2005-01-06 12:58:48.000000000 -0800
+++ linus/include/asm-x86_64/page.h 2005-01-10 11:56:04.000000000 -0800
@@ -38,6 +38,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
* These are used to make use of C type-checking..
*/
Index: linus/include/asm-s390/page.h
===================================================================
--- linus.orig/include/asm-s390/page.h 2004-10-20 12:04:59.000000000 -0700
+++ linus/include/asm-s390/page.h 2005-01-10 11:56:33.000000000 -0800
@@ -106,6 +106,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/* Pure 2^n version of get_order */
extern __inline__ int get_order(unsigned long size)
{

2005-01-10 23:57:29

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V4 [1/4]: Arch specific page zeroing during page fault

This patch fixes the __GFP_ZERO related code by adding a new function
alloc_zeroed_user_highpage that is then used in the anonymous page fault
handler and in the COW code to allocate pages. The function can be defined
per arch to setup special processing for user pages by defining
__HAVE_ARCH_ALLOC_ZEROED_USER_PAGE.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -75,6 +75,16 @@
flush_dcache_page(page); \
} while (0)

+
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+({ \
+ struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+ flush_dcache_page(page); \
+ page; \
+})
+
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

#ifdef CONFIG_VIRTUAL_MEM_MAP
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/memory.c 2005-01-10 13:54:30.000000000 -0800
@@ -84,20 +84,6 @@
EXPORT_SYMBOL(vmalloc_earlyreserve);

/*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
-{
- if (from == ZERO_PAGE(address)) {
- clear_user_highpage(to, address);
- return;
- }
- copy_user_highpage(to, from, address);
-}
-
-/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
@@ -1329,11 +1315,16 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_new_page;
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!new_page)
- goto no_new_page;
- copy_cow_page(old_page,new_page,address);
-
+ if (old_page == ZERO_PAGE(address)) {
+ new_page = alloc_zeroed_user_highpage(vma, address);
+ if (!new_page)
+ goto no_new_page;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ if (!new_page)
+ goto no_new_page;
+ copy_user_highpage(new_page, old_page, address);
+ }
/*
* Re-check the pte - we dropped the lock
*/
@@ -1795,7 +1786,7 @@

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
+ page = alloc_zeroed_user_highpage(vma, addr);
if (!page)
goto no_mem;

Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -17,6 +17,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -18,6 +18,9 @@
extern void clear_page(void *page);
#define clear_user_page(page, vaddr, pg) clear_page(page)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -21,6 +21,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-10 13:53:59.000000000 -0800
@@ -42,6 +42,17 @@
smp_wmb();
}

+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+static inline struct page* alloc_zeroed_user_highpage(struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+
+ clear_user_highpage(page, vaddr);
+ return page;
+}
+#endif
+
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -36,6 +36,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -38,6 +38,8 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-10 13:53:59.000000000 -0800
@@ -106,6 +106,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/* Pure 2^n version of get_order */
extern __inline__ int get_order(unsigned long size)
{

2005-01-11 00:02:35

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V4 [0/4]: Overview

Changes from V3 to V4:
o Drop __GFP_ZERO patch since its in Linus tree. Include new patch that allows
archs that need special measures around zeroing of user pages during a page
fault to maintain their special adaptations.
o Use zeroed pages during COW.
o Updates for clear_page for various platforms. Make clear_page an optional
patch and fall back to a series of clear_page without order if the patch
to expand clear_page patch has not been applied.
o x86_64 asm code fixed up
o Port patches to 2.6.10-bk13 and make it fit the bitmapless buddy allocator

The patches increasing the page fault rate (introduction of atomic pte
operations and anticipatory prefaulting) do so by reducing the locking
overhead and are therefore mainly of interest for applications running in
SMP systems with a high number of cpus. The single thread performance does
just show minor increases. Only the performance of multi-threaded
applications increases significantly.

The most expensive operation in the page fault handler is (apart of SMP
locking overhead) the zeroing of the page that is also done in the page fault
handler. This zeroing means that all cachelines of the faulted page (on Altix
that means all 128 cachelines of 128 byte each) must be loaded and later
written back. This patch allows to avoid having to load all cachelines
if only a part of the cachelines of that page is needed immediately after
the fault. Doing so will only be effective for sparsely accessed memory
which is typical for anonymous memory and pte maps. Prezeroed pages will
only be used for those purposes. Unzeroed pages will be used as usual for
file mapping, page caching etc etc.

The patch makes prezeroing very effective by:

1. Aggregating zeroing operations to only apply to pages of higher order,
which results in many pages that will later become zero 0 to be zeroed in one
step.
For that purpose the existing clear_page function is extended and made to
take an additional argument specifying the order of the page to be cleared.

2. Hardware support for offloading zeroing from the cpu. This avoids
the invalidation of the cpu caches by extensive zeroing operations.

The scrub daemon is invoked when a unzeroed page of a certain order has
been generated so that its worth running it. If no higher order pages are
present then the logic will favor hot zeroing rather than simply shifting
processing around. kscrubd typically runs only for a fraction of a second
and sleeps for long periods of time even under memory benchmarking. kscrubd
performs short bursts of zeroing when needed and tries to stay out off the
processor as much as possible.

The benefits of prezeroing are reduced to minimal quantities if all
cachelines of a page are touched. Prezeroing can only be effective
if the whole page is not immediately used after the page fault.

The patch is composed of 4 parts:

[1/4] GFP_ZERO fixups
Adds alloc_zeroed_user_highpage(vma, vaddr) that may be customized for
each arch by defining __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE. Includes
proper definitions for a large selection of arches, others fall back to
the default function in include/linux/highmem.h (and falls back to not
using prezeroed pages).

[2/4] Page Zeroing
Adds management of ZEROED and NOT_ZEROED pages and a background daemon
called scrubd. scrubd is disabled by default but can be enabled
by writing an order number to /proc/sys/vm/scrub_start. If a page
is coalesced of that order or higher then the scrub daemon will
start zeroing until all pages of order /proc/sys/vm/scrub_stop and
higher are zeroed and then go back to sleep.

In an SMP environment the scrub daemon is typically
running on the most idle cpu. Thus a single threaded application running
on one cpu may have the other cpu zeroing pages for it etc. The scrub
daemon is hardly noticable and usually finished zeroing quickly since
most processors are optimized for linear memory filling.

The following patches increase performance but may be omitted:


[2/4] SGI Altix Block Transfer Engine Support
Implements a driver to shift the zeroing off the cpu into hardware.
With hardware support the impact of zeroing on the system is reduced
to a minimum.

[4/4] Architecture specific clear_page updates
Adds second order argument to clear_page and updates all arches.
This allows the zeroing of large areas of memory without repeately
invoking clear_page() for the page allocator, scrubd and the huge
page allocator.


2005-01-11 00:08:35

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V4 [3/4]: Altix SN2 BTE zero driver

o Zeroing driver implemented with the Block Transfer Engine in the Altix
SN2 SHub.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/arch/ia64/sn/kernel/bte.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/bte.c 2004-12-24 13:34:58.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/bte.c 2005-01-10 13:54:52.000000000 -0800
@@ -4,6 +4,8 @@
* for more details.
*
* Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * Support for zeroing pages, Christoph Lameter, SGI, December 2004.
*/

#include <linux/config.h>
@@ -20,6 +22,8 @@
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/scrub.h>

#include <asm/sn/bte.h>

@@ -30,7 +34,7 @@
/* two interfaces on two btes */
#define MAX_INTERFACES_TO_TRY 4

-static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
+static inline struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
{
nodepda_t *tmp_nodepda;

@@ -132,7 +136,6 @@
if (bte == NULL) {
continue;
}
-
if (spin_trylock(&bte->spinlock)) {
if (!(*bte->most_rcnt_na & BTE_WORD_AVAILABLE) ||
(BTE_LNSTAT_LOAD(bte) & BTE_ACTIVE)) {
@@ -157,7 +160,7 @@
}
} while (1);

- if (notification == NULL) {
+ if (notification == NULL || (mode & BTE_NOTIFY_AND_GET_POINTER)) {
/* User does not want to be notified. */
bte->most_rcnt_na = &bte->notify;
} else {
@@ -192,6 +195,8 @@

itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec);

+ if (mode & BTE_NOTIFY_AND_GET_POINTER)
+ *(u64 volatile **)(notification) = &bte->notify;
spin_unlock_irqrestore(&bte->spinlock, irq_flags);

if (notification != NULL) {
@@ -449,5 +454,47 @@
mynodepda->bte_if[i].cleanup_active = 0;
mynodepda->bte_if[i].bh_error = 0;
}
+}
+
+u64 *bte_zero_notify[MAX_COMPACT_NODES];
+
+#define ZERO_RATE_PER_SEC 500000000
+
+static int bte_start_bzero(void *p, unsigned long len)
+{
+ int rc;
+ int ticks;
+ int node = get_nasid();
+
+ /* Check limitations.
+ 1. System must be running (weird things happen during bootup)
+ 2. Size >64KB. Smaller requests cause too much bte traffic
+ */
+ if (len >= BTE_MAX_XFER || len < 60000 || system_state != SYSTEM_RUNNING)
+ return EINVAL;
+
+ rc = bte_zero(ia64_tpa(p), len, BTE_NOTIFY_AND_GET_POINTER, bte_zero_notify+node);
+ if (rc)
+ return rc;
+
+ ticks = (len*HZ)/ZERO_RATE_PER_SEC;
+ if (ticks) {
+ /* Wait the minimum time of the transfer */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(ticks);
+ }
+ while (*(bte_zero_notify[node]) != BTE_WORD_BUSY) {
+ /* Then keep on checking until transfer is complete */
+ cpu_relax();
+ schedule();
+ }
+ return 0;
+}
+
+static struct zero_driver bte_bzero = {
+ .start = bte_start_bzero,
+};

+void sn_bte_bzero_init(void) {
+ register_zero_driver(&bte_bzero);
}
Index: linux-2.6.10/arch/ia64/sn/kernel/setup.c
===================================================================
--- linux-2.6.10.orig/arch/ia64/sn/kernel/setup.c 2005-01-10 13:48:08.000000000 -0800
+++ linux-2.6.10/arch/ia64/sn/kernel/setup.c 2005-01-10 13:54:52.000000000 -0800
@@ -244,6 +244,7 @@
int pxm;
int major = sn_sal_rev_major(), minor = sn_sal_rev_minor();
extern void sn_cpu_init(void);
+ extern void sn_bte_bzero_init(void);

/*
* If the generic code has enabled vga console support - lets
@@ -334,6 +335,7 @@
screen_info = sn_screen_info;

sn_timer_init();
+ sn_bte_bzero_init();
}

/**
Index: linux-2.6.10/include/asm-ia64/sn/bte.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/sn/bte.h 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/sn/bte.h 2005-01-10 13:54:52.000000000 -0800
@@ -48,6 +48,8 @@
#define BTE_ZERO_FILL (BTE_NOTIFY | IBCT_ZFIL_MODE)
/* Use a reserved bit to let the caller specify a wait for any BTE */
#define BTE_WACQUIRE (0x4000)
+/* Return the pointer to the notification cacheline to the user */
+#define BTE_NOTIFY_AND_GET_POINTER (0x8000)
/* Use the BTE on the node with the destination memory */
#define BTE_USE_DEST (BTE_WACQUIRE << 1)
/* Use any available BTE interface on any node for the transfer */

2005-01-11 00:27:58

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V4 [4/4]: Extend clear_page to take an order parameter


- Extend clear_page to take an order parameter.

Architecture support:
---------------------

Known to work:

ia64
i386
x86_64
sparc64
m68k

Trivial modification expected to simply work:

arm
cris
h8300
m68knommu
ppc
ppc64
sh64
v850
parisc
sparc
um

Modification made but it would be good to have some feedback from the arch maintainers:

s390
alpha
sh
mips
m32r

Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-10 14:23:21.000000000 -0800
@@ -56,7 +56,7 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_page (void *page, int order);
extern void copy_page (void *to, void *from);

/*
@@ -65,7 +65,7 @@
*/
#define clear_user_page(addr, vaddr, page) \
do { \
- clear_page(addr); \
+ clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)

Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_page(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,12 +28,12 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -32,10 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_page(void *, int);
void copy_page(void *, void *);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-sparc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -28,10 +28,10 @@

#ifndef __ASSEMBLY__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
sparc_flush_page_to_ram(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -22,12 +22,12 @@

#ifndef __s390x__

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
register_pair rp;

rp.subreg.even = (unsigned long) page;
- rp.subreg.odd = (unsigned long) 4096;
+ rp.subreg.odd = (unsigned long) 4096 << order;
asm volatile (" slr 1,1\n"
" mvcl %0,0"
: "+&a" (rp) : : "memory", "cc", "1" );
@@ -63,14 +63,19 @@

#else /* __s390x__ */

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
- asm volatile (" lgr 2,%0\n"
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ asm volatile (" lgr 2,%0\n"
" lghi 3,4096\n"
" slgr 1,1\n"
" mvcl 2,0"
: : "a" ((void *) (page))
: "memory", "cc", "1", "2", "3" );
+ page += PAGE_SIZE;
+ }
}

static inline void copy_page(void *to, void *from)
@@ -103,7 +108,7 @@

#endif /* __s390x__ */

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800
+++ linux-2.6.10/arch/i386/lib/mmx.c 2005-01-10 14:23:22.000000000 -0800
@@ -128,7 +128,7 @@
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.10/include/asm-x86_64/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/mmx.h 2005-01-10 14:23:22.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800
+++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-01-10 14:23:22.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

GLOBAL_ENTRY(clear_page)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
Index: linux-2.6.10/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-01-10 14:23:22.000000000 -0800
@@ -1,12 +1,16 @@
/*
* Zero a page.
* rdi page
+ * rsi order
*/
.globl clear_page
.p2align 4
clear_page:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
- movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -41,7 +45,10 @@

.section .altinstr_replacement,"ax"
clear_page_c:
- movl $4096/8,%ecx
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
rep
stosq
Index: linux-2.6.10/include/asm-sh/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/page.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-sh/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -36,12 +36,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void (*clear_page)(void *to);
+extern void (*_clear_page)(void *to);
extern void (*copy_page)(void *to, void *from);

extern void clear_page_slow(void *to);
extern void copy_page_slow(void *to, void *from);

+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
#if defined(CONFIG_SH7705_CACHE_32KB) && defined(CONFIG_MMU)
struct page;
extern void clear_user_page(void *to, unsigned long address, struct page *pg);
@@ -49,7 +59,7 @@
extern void __clear_user_page(void *to, void *orig_to);
extern void __copy_user_page(void *to, void *from, void *orig_to);
#elif defined(CONFIG_CPU_SH2) || defined(CONFIG_CPU_SH3) || !defined(CONFIG_MMU)
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#elif defined(CONFIG_CPU_SH4)
struct page;
Index: linux-2.6.10/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-i386/mmx.h 2005-01-10 14:23:22.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/alpha/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/clear_page.S 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/clear_page.S 2005-01-10 14:23:22.000000000 -0800
@@ -6,11 +6,10 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0
-
lda $0,128
nop
unop
@@ -36,4 +35,4 @@
unop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/include/asm-sh64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -50,12 +50,20 @@
extern void sh64_page_clear(void *page);
extern void sh64_page_copy(void *from, void *to);

-#define clear_page(page) sh64_page_clear(page)
+static inline void clear_page(page, order)
+{
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ sh64_page_clear(page++, 0);
+ }
+}
+
#define copy_page(to,from) sh64_page_copy(from, to)

#if defined(CONFIG_DCACHE_DISABLED)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) sh_clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#else
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-arm/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-arm/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -128,7 +128,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
extern void copy_page(void *to, const void *from);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-ppc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc64/page.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/asm-ppc64/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -102,12 +102,12 @@
#define REGION_MASK (((1UL<<REGION_SIZE)-1UL)<<REGION_SHIFT)
#define REGION_STRIDE (1UL << REGION_SHIFT)

-static __inline__ void clear_page(void *addr)
+static __inline__ void clear_page(void *addr, unsigned int order)
{
unsigned long lines, line_size;

line_size = ppc64_caches.dline_size;
- lines = ppc64_caches.dlines_per_page;
+ lines = ppc64_caches.dlines_per_page << order;

__asm__ __volatile__(
"mtctr %1 # clear_page\n\
Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -11,10 +11,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void *to);
+extern void _clear_page(void *to);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- > 0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+
extern void copy_page(void *to, void *from);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -15,8 +15,20 @@

#define STRICT_MM_TYPECHECKS

-extern void clear_page(void *page);
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+extern void _clear_page(void *page);
+
+static inline void clear_page(void *page, int order)
+{
+ int nr = 1 << order;
+
+ while (nr--)
+ {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
Index: linux-2.6.10/arch/mips/mm/pg-sb1.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-sb1.c 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-sb1.c 2005-01-10 14:23:22.000000000 -0800
@@ -42,7 +42,7 @@
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void clear_page_cpu(void *page)
#else
-void clear_page(void *page)
+void _clear_page(void *page)
#endif
{
unsigned char *addr = (unsigned char *) page;
@@ -172,14 +172,13 @@
IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_BASE)));
}

-void clear_page(void *page)
+void _clear_page(void *page)
{
int cpu = smp_processor_id();

/* if the page is above Kseg0, use old way */
if (KSEGX(page) != CAC_BASE)
return clear_page_cpu(page);
-
page_descr[cpu].dscr_a = PHYSADDR(page) | M_DM_DSCRA_ZERO_MEM | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
page_descr[cpu].dscr_b = V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
__raw_writeq(1, IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_COUNT)));
@@ -218,5 +217,5 @@

#endif

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);
EXPORT_SYMBOL(copy_page);
Index: linux-2.6.10/include/asm-m68k/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/page.h 2004-12-24 13:35:49.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -50,7 +50,7 @@
);
}

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
unsigned long tmp;
unsigned long *sp = page;
@@ -69,16 +69,16 @@
"dbra %1,1b\n\t"
: "=a" (sp), "=d" (tmp)
: "a" (page), "0" (sp),
- "1" ((PAGE_SIZE - 16) / 16 - 1));
+ "1" (((PAGE_SIZE<<(order)) - 16) / 16 - 1));
}

#else
-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)
#endif

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-mips/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/page.h 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/include/asm-mips/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -39,7 +39,18 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void * page);
+extern void _clear_page(void * page);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
extern void copy_page(void * to, void * from);

extern unsigned long shm_align_mask;
@@ -57,7 +68,7 @@
{
extern void (*flush_data_cache_page)(unsigned long addr);

- clear_page(addr);
+ clear_page(addr, 0);
if (pages_do_alias((unsigned long) addr, vaddr))
flush_data_cache_page((unsigned long)addr);
}
Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -15,10 +15,10 @@

#ifdef __KERNEL__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
Index: linux-2.6.10/include/asm-v850/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-v850/page.h 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/include/asm-v850/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -37,11 +37,11 @@

#define STRICT_MM_TYPECHECKS

-#define clear_page(page) memset ((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset ((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to, from) memcpy ((void *)(to), (void *)from, PAGE_SIZE)

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-parisc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/page.h 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -13,7 +13,7 @@
#include <asm/types.h>
#include <asm/cache.h>

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) copy_user_page_asm((void *)(to), (void *)(from))

struct page;
Index: linux-2.6.10/arch/arm/mm/copypage-v6.c
===================================================================
--- linux-2.6.10.orig/arch/arm/mm/copypage-v6.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/arm/mm/copypage-v6.c 2005-01-10 14:23:22.000000000 -0800
@@ -47,7 +47,7 @@
*/
void v6_clear_user_page_nonaliasing(void *kaddr, unsigned long vaddr)
{
- clear_page(kaddr);
+ _clear_page(kaddr);
}

/*
@@ -116,7 +116,7 @@

set_pte(to_pte + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
flush_tlb_kernel_page(to);
- clear_page((void *)to);
+ _clear_page((void *)to);

spin_unlock(&v6_lock);
}
Index: linux-2.6.10/arch/m32r/mm/page.S
===================================================================
--- linux-2.6.10.orig/arch/m32r/mm/page.S 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/arch/m32r/mm/page.S 2005-01-10 14:23:22.000000000 -0800
@@ -51,7 +51,7 @@
jmp r14

.text
- .global clear_page
+ .global _clear_page
/*
* clear_page (to)
*
@@ -60,7 +60,7 @@
* 16 * 256
*/
.align 4
-clear_page:
+_clear_page:
ldi r2, #255
ldi r4, #0
ld r3, @r0 /* cache line allocate */
Index: linux-2.6.10/include/asm-ppc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-ppc/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -85,7 +85,7 @@

struct page;
extern void clear_pages(void *page, int order);
-static inline void clear_page(void *page) { clear_pages(page, 0); }
+#define clear_page clear_pages
extern void copy_page(void *to, void *from);
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
Index: linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/kernel/alpha_ksyms.c 2004-12-24 13:33:51.000000000 -0800
+++ linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c 2005-01-10 14:23:22.000000000 -0800
@@ -88,7 +88,7 @@
EXPORT_SYMBOL(__memsetw);
EXPORT_SYMBOL(__constant_c_memset);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(__direct_map_base);
EXPORT_SYMBOL(__direct_map_size);
Index: linux-2.6.10/arch/alpha/lib/ev6-clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/ev6-clear_page.S 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/ev6-clear_page.S 2005-01-10 14:23:22.000000000 -0800
@@ -6,9 +6,9 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0

lda $0,128
@@ -51,4 +51,4 @@
nop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/arch/sh/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/init.c 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/init.c 2005-01-10 14:23:22.000000000 -0800
@@ -57,7 +57,7 @@
#endif

void (*copy_page)(void *from, void *to);
-void (*clear_page)(void *to);
+void (*_clear_page)(void *to);

void show_mem(void)
{
@@ -255,7 +255,7 @@
* later in the boot process if a better method is available.
*/
copy_page = copy_page_slow;
- clear_page = clear_page_slow;
+ _clear_page = clear_page_slow;

/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem_node(NODE_DATA(0));
Index: linux-2.6.10/arch/sh/mm/pg-dma.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-dma.c 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-dma.c 2005-01-10 14:23:22.000000000 -0800
@@ -78,7 +78,7 @@
return ret;

copy_page = copy_page_dma;
- clear_page = clear_page_dma;
+ _clear_page = clear_page_dma;

return ret;
}
Index: linux-2.6.10/arch/sh/mm/pg-nommu.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-nommu.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-nommu.c 2005-01-10 14:23:22.000000000 -0800
@@ -27,7 +27,7 @@
static int __init pg_nommu_init(void)
{
copy_page = copy_page_nommu;
- clear_page = clear_page_nommu;
+ _clear_page = clear_page_nommu;

return 0;
}
Index: linux-2.6.10/arch/mips/mm/pg-r4k.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-r4k.c 2004-12-24 13:34:49.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-r4k.c 2005-01-10 14:23:22.000000000 -0800
@@ -39,9 +39,9 @@

static unsigned int clear_page_array[0x130 / 4];

-void clear_page(void * page) __attribute__((alias("clear_page_array")));
+void _clear_page(void * page) __attribute__((alias("clear_page_array")));

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

/*
* Maximum sizes:
Index: linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/m32r/kernel/m32r_ksyms.c 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c 2005-01-10 14:23:22.000000000 -0800
@@ -102,7 +102,7 @@
EXPORT_SYMBOL(memcmp);
EXPORT_SYMBOL(memscan);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(strcat);
EXPORT_SYMBOL(strchr);
Index: linux-2.6.10/include/asm-arm26/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm26/page.h 2004-12-24 13:35:22.000000000 -0800
+++ linux-2.6.10/include/asm-arm26/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -25,7 +25,7 @@
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
#define copy_page(to, from) __copy_user_page(to, from, 0);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/page.h 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/page.h 2005-01-10 14:23:22.000000000 -0800
@@ -14,8 +14,8 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, unsigned long order);
+#define clear_page(X,Y) _clear_page((void *)(X),(Y))
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.10/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800
+++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-01-10 14:23:22.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
Index: linux-2.6.10/drivers/net/tc35815.c
===================================================================
--- linux-2.6.10.orig/drivers/net/tc35815.c 2004-12-24 13:33:48.000000000 -0800
+++ linux-2.6.10/drivers/net/tc35815.c 2005-01-10 14:23:22.000000000 -0800
@@ -657,7 +657,7 @@
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
} else {
- clear_page(lp->fd_buf);
+ clear_page(lp->fd_buf, 0);
#ifdef __mips__
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-10 13:53:59.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-10 14:23:22.000000000 -0800
@@ -56,7 +56,7 @@
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
- clear_page(kaddr);
+ clear_page(kaddr, 0);
kunmap_atomic(kaddr, KM_USER0);
}

Index: linux-2.6.10/fs/afs/file.c
===================================================================
--- linux-2.6.10.orig/fs/afs/file.c 2004-12-24 13:35:59.000000000 -0800
+++ linux-2.6.10/fs/afs/file.c 2005-01-10 14:23:22.000000000 -0800
@@ -172,7 +172,7 @@
(size_t) PAGE_SIZE);
desc.buffer = kmap(page);

- clear_page(desc.buffer);
+ clear_page(desc.buffer, 0);

/* read the contents of the file from the server into the
* page */
Index: linux-2.6.10/fs/ntfs/compress.c
===================================================================
--- linux-2.6.10.orig/fs/ntfs/compress.c 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/fs/ntfs/compress.c 2005-01-10 14:23:22.000000000 -0800
@@ -107,7 +107,7 @@
* FIXME: Using clear_page() will become wrong when we get
* PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
*/
- clear_page(kp);
+ clear_page(kp, 0);
return;
}
kp_ofs = ni->initialized_size & ~PAGE_CACHE_MASK;
@@ -742,7 +742,7 @@
* for now there is no problem.
*/
if (likely(!cur_ofs))
- clear_page(page_address(page));
+ clear_page(page_address(page), 0);
else
memset(page_address(page) + cur_ofs, 0,
PAGE_CACHE_SIZE -
Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-10 14:21:06.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-10 14:23:22.000000000 -0800
@@ -639,6 +639,10 @@
{
int i;

+ if (!PageHighMem(page)) {
+ clear_page(page_address(page), order);
+ return;
+ }
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.10/mm/hugetlb.c
===================================================================
--- linux-2.6.10.orig/mm/hugetlb.c 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/hugetlb.c 2005-01-10 14:23:22.000000000 -0800
@@ -89,8 +89,7 @@
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ clear_page(page_address(page), HUGETLB_PAGE_ORDER);
return page;
}


2005-01-11 00:41:17

by Christoph Lameter

[permalink] [raw]
Subject: Prezeroing V4 [2/4]: Zeroing implementation

o Add page zeroing
o Add scrub daemon
o Add ability to view amount of zeroed information in /proc/meninfo

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-10 14:44:22.000000000 -0800
@@ -12,6 +12,7 @@
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ * Support for page zeroing, Christoph Lameter, SGI, Dec 2004
*/

#include <linux/config.h>
@@ -33,6 +34,7 @@
#include <linux/cpu.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/scrub.h>

#include <asm/tlbflush.h>
#include "internal.h"
@@ -167,16 +169,16 @@
* zone->lock is already acquired when we use these.
* So, we don't need atomic page->flags operations here.
*/
-static inline unsigned long page_order(struct page *page) {
+static inline unsigned long page_zorder(struct page *page) {
return page->private;
}

-static inline void set_page_order(struct page *page, int order) {
- page->private = order;
+static inline void set_page_zorder(struct page *page, int order, int zero) {
+ page->private = order + (zero << 10);
__SetPagePrivate(page);
}

-static inline void rmv_page_order(struct page *page)
+static inline void rmv_page_zorder(struct page *page)
{
__ClearPagePrivate(page);
page->private = 0;
@@ -187,14 +189,15 @@
* we can do coalesce a page and its buddy if
* (a) the buddy is free &&
* (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order and the same
+ * zeroing status.
* for recording page's order, we use page->private and PG_private.
*
*/
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, int order, int zero)
{
if (PagePrivate(page) &&
- (page_order(page) == order) &&
+ (page_zorder(page) == order + (zero << 10)) &&
!PageReserved(page) &&
page_count(page) == 0)
return 1;
@@ -225,22 +228,20 @@
* -- wli
*/

-static inline void __free_pages_bulk (struct page *page, struct page *base,
- struct zone *zone, unsigned int order)
+static inline int __free_pages_bulk (struct page *page, struct page *base,
+ struct zone *zone, unsigned int order, int zero)
{
unsigned long page_idx;
struct page *coalesced;
- int order_size = 1 << order;

if (unlikely(order))
destroy_compound_page(page, order);

page_idx = page - base;

- BUG_ON(page_idx & (order_size - 1));
+ BUG_ON(page_idx & (( 1 << order) - 1));
BUG_ON(bad_range(zone, page));

- zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
struct free_area *area;
struct page *buddy;
@@ -250,20 +251,21 @@
buddy = base + buddy_idx;
if (bad_range(zone, buddy))
break;
- if (!page_is_buddy(buddy, order))
+ if (!page_is_buddy(buddy, order, zero))
break;
/* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = zone->free_area[zero] + order;
area->nr_free--;
- rmv_page_order(buddy);
+ rmv_page_zorder(buddy);
page_idx &= buddy_idx;
order++;
}
coalesced = base + page_idx;
- set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ set_page_zorder(coalesced, order, zero);
+ list_add(&coalesced->lru, &zone->free_area[zero][order].free_list);
+ zone->free_area[zero][order].nr_free++;
+ return order;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -312,8 +314,11 @@
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, base, zone, order);
+ if (__free_pages_bulk(page, base, zone, order, NOT_ZEROED)
+ >= sysctl_scrub_start)
+ wakeup_kscrubd(zone);
ret++;
+ zone->free_pages += 1UL << order;
}
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
@@ -341,6 +346,18 @@
free_pages_bulk(page_zone(page), 1, &list, order);
}

+void end_zero_page(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ struct zone * zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ __free_pages_bulk(page, zone->zone_mem_map, zone, order, ZEROED);
+ zone->zero_pages += 1UL << order;
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+}

/*
* The order of subdivision here is critical for the IO subsystem.
@@ -358,7 +375,7 @@
*/
static inline struct page *
expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area)
+ int low, int high, struct free_area *area, int zero)
{
unsigned long size = 1 << high;

@@ -369,7 +386,7 @@
BUG_ON(bad_range(zone, &page[size]));
list_add(&page[size].lru, &area->free_list);
area->nr_free++;
- set_page_order(&page[size], high);
+ set_page_zorder(&page[size], high, zero);
}
return page;
}
@@ -419,23 +436,44 @@
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static void inline rmpage(struct page *page, struct free_area *area)
+{
+ list_del(&page->lru);
+ rmv_page_zorder(page);
+ area->nr_free--;
+}
+
+struct page *scrubd_rmpage(struct zone *zone, struct free_area *area)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+ rmpage(page, area);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return page;
+}
+
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero)
{
- struct free_area * area;
+ struct free_area *area;
unsigned int current_order;
struct page *page;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ area = zone->free_area[zero] + current_order;
if (list_empty(&area->free_list))
continue;

page = list_entry(area->free_list.next, struct page, lru);
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ rmpage(page, zone->free_area[zero] + current_order);
zone->free_pages -= 1UL << order;
- return expand(zone, page, order, current_order, area);
+ if (zero)
+ zone->zero_pages -= 1UL << order;
+ return expand(zone, page, order, current_order, area, zero);
}

return NULL;
@@ -447,7 +485,7 @@
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list, int zero)
{
unsigned long flags;
int i;
@@ -456,7 +494,7 @@

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
if (page == NULL)
break;
allocated++;
@@ -503,7 +541,7 @@
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
+ list_for_each(curr, &zone->free_area[NOT_ZEROED][order].free_list) {
unsigned long start_pfn, i;

start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
@@ -595,7 +633,7 @@
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static inline void prep_zero_page(struct page *page, int order)
+void prep_zero_page(struct page *page, unsigned int order)
{
int i;

@@ -608,7 +646,9 @@
{
unsigned long flags;
struct page *page = NULL;
- int cold = !!(gfp_flags & __GFP_COLD);
+ int nr_pages = 1 << order;
+ int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages);
+ int cold = !!(gfp_flags & __GFP_COLD) + 2*zero;

if (order == 0) {
struct per_cpu_pages *pcp;
@@ -617,7 +657,7 @@
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list, zero);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -629,16 +669,25 @@

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
+ /*
+ * If we failed to obtain a zero and/or unzeroed page
+ * then we may still be able to obtain the other
+ * type of page.
+ */
+ if (!page) {
+ page = __rmqueue(zone, order, !zero);
+ zero = 0;
+ }
spin_unlock_irqrestore(&zone->lock, flags);
}

if (page != NULL) {
BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
+ mod_page_state_zone(zone, pgalloc, nr_pages);
prep_new_page(page, order);

- if (gfp_flags & __GFP_ZERO)
+ if ((gfp_flags & __GFP_ZERO) && !zero)
prep_zero_page(page, order);

if (order && (gfp_flags & __GFP_COMP))
@@ -667,7 +716,7 @@
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ free_pages -= (z->free_area[NOT_ZEROED][o].nr_free + z->free_area[ZEROED][o].nr_free) << o;

/* Require fewer higher order pages to be free */
min >>= 1;
@@ -1045,7 +1094,7 @@
}

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat)
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat)
{
struct zone *zones = pgdat->node_zones;
int i;
@@ -1053,27 +1102,31 @@
*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
*active += zones[i].nr_active;
*inactive += zones[i].nr_inactive;
*free += zones[i].free_pages;
+ *zero += zones[i].zero_pages;
}
}

void get_zone_counts(unsigned long *active,
- unsigned long *inactive, unsigned long *free)
+ unsigned long *inactive, unsigned long *free, unsigned long *zero)
{
struct pglist_data *pgdat;

*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for_each_pgdat(pgdat) {
- unsigned long l, m, n;
- __get_zone_counts(&l, &m, &n, pgdat);
+ unsigned long l, m, n,o;
+ __get_zone_counts(&l, &m, &n, &o, pgdat);
*active += l;
*inactive += m;
*free += n;
+ *zero += o;
}
}

@@ -1110,6 +1163,7 @@

#define K(x) ((x) << (PAGE_SHIFT-10))

+const char *temp[3] = { "hot", "cold", "zero" };
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -1122,6 +1176,7 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;
struct zone *zone;

for_each_zone(zone) {
@@ -1142,10 +1197,10 @@

pageset = zone->pageset + cpu;

- for (temperature = 0; temperature < 2; temperature++)
+ for (temperature = 0; temperature < 3; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
cpu,
- temperature ? "cold" : "hot",
+ temp[temperature],
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch);
@@ -1153,20 +1208,21 @@
}

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

printk("\nFree pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));

printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
+ zero,
ps.nr_slab,
ps.nr_mapped,
ps.nr_page_table_pages);
@@ -1215,7 +1271,7 @@

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
+ nr = zone->free_area[NOT_ZEROED][order].nr_free + zone->free_area[ZEROED][order].nr_free;
total += nr << order;
printk("%lu*%lukB ", nr, K(1UL) << order);
}
@@ -1515,8 +1571,10 @@
{
int order;
for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
+ INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list);
+ zone->free_area[NOT_ZEROED][order].nr_free = 0;
+ zone->free_area[ZEROED][order].nr_free = 0;
}
}

@@ -1541,6 +1599,7 @@

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->kscrubd_wait);
pgdat->kswapd_max_order = 0;

for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -1564,6 +1623,7 @@
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->zero_pages = 0;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

@@ -1597,6 +1657,13 @@
pcp->high = 2 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1722,7 +1789,7 @@
spin_lock_irqsave(&zone->lock, flags);
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_printf(m, "%6lu ", zone->free_area[NOT_ZEROED][order].nr_free);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
Index: linux-2.6.10/include/linux/mmzone.h
===================================================================
--- linux-2.6.10.orig/include/linux/mmzone.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/mmzone.h 2005-01-10 13:54:50.000000000 -0800
@@ -51,7 +51,7 @@
};

struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+ struct per_cpu_pages pcp[3]; /* 0: hot. 1: cold 2: cold zeroed pages */
#ifdef CONFIG_NUMA
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
@@ -107,10 +107,14 @@
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/

+#define NOT_ZEROED 0
+#define ZEROED 1
+
struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+ unsigned long zero_pages;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
@@ -131,7 +135,7 @@
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[2][MAX_ORDER];


ZONE_PADDING(_pad1_)
@@ -266,6 +270,9 @@
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
+
+ wait_queue_head_t kscrubd_wait;
+ struct task_struct *kscrubd;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -274,9 +281,9 @@
extern struct pglist_data *pgdat_list;

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat);
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free);
+ unsigned long *free, unsigned long *zero);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Index: linux-2.6.10/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.10.orig/fs/proc/proc_misc.c 2005-01-10 13:48:10.000000000 -0800
+++ linux-2.6.10/fs/proc/proc_misc.c 2005-01-10 13:54:50.000000000 -0800
@@ -123,12 +123,13 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

/*
* display in kilobytes.
@@ -148,6 +149,7 @@
len = sprintf(page,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemZero: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -171,6 +173,7 @@
"VmallocChunk: %8lu kB\n",
K(i.totalram),
K(i.freeram),
+ K(zero),
K(i.bufferram),
K(get_page_cache_size()-total_swapcache_pages-i.bufferram),
K(total_swapcache_pages),
Index: linux-2.6.10/mm/readahead.c
===================================================================
--- linux-2.6.10.orig/mm/readahead.c 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/readahead.c 2005-01-10 13:54:50.000000000 -0800
@@ -573,7 +573,8 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;

- __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id()));
return min(nr, (inactive + free) / 2);
}
Index: linux-2.6.10/drivers/base/node.c
===================================================================
--- linux-2.6.10.orig/drivers/base/node.c 2005-01-10 13:48:08.000000000 -0800
+++ linux-2.6.10/drivers/base/node.c 2005-01-10 13:54:50.000000000 -0800
@@ -42,13 +42,15 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;

si_meminfo_node(&i, nid);
- __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid));

n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
+ "Node %d MemZero: %8lu kB\n"
"Node %d MemUsed: %8lu kB\n"
"Node %d Active: %8lu kB\n"
"Node %d Inactive: %8lu kB\n"
@@ -58,6 +60,7 @@
"Node %d LowFree: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
+ nid, K(zero),
nid, K(i.totalram - i.freeram),
nid, K(active),
nid, K(inactive),
Index: linux-2.6.10/include/linux/sched.h
===================================================================
--- linux-2.6.10.orig/include/linux/sched.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/sched.h 2005-01-10 13:54:50.000000000 -0800
@@ -731,6 +731,7 @@
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
+#define PF_KSCRUBD 0x00800000 /* I am kscrubd */

#ifdef CONFIG_SMP
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
Index: linux-2.6.10/mm/Makefile
===================================================================
--- linux-2.6.10.orig/mm/Makefile 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/mm/Makefile 2005-01-10 13:54:50.000000000 -0800
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o scrubd.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
Index: linux-2.6.10/mm/scrubd.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/mm/scrubd.c 2005-01-10 14:56:20.000000000 -0800
@@ -0,0 +1,134 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/suspend.h>
+#include <linux/sysctl.h>
+#include <linux/scrub.h>
+
+unsigned int sysctl_scrub_start = 5; /* if a page of this order is coalesed then run kscrubd */
+unsigned int sysctl_scrub_stop = 2; /* Mininum order of page to zero */
+unsigned int sysctl_scrub_load = 999; /* Do not run scrubd if load > */
+
+/*
+ * sysctl handler for /proc/sys/vm/scrub_start
+ */
+int scrub_start_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (sysctl_scrub_start < MAX_ORDER) {
+ struct zone *zone;
+
+ for_each_zone(zone)
+ wakeup_kscrubd(zone);
+ }
+ return 0;
+}
+
+LIST_HEAD(zero_drivers);
+
+/*
+ * zero_highest_order_page takes a page off the freelist
+ * and then hands it off to block zeroing agents.
+ * The cleared pages are added to the back of
+ * the freelist where the page allocator may pick them up.
+ */
+int zero_highest_order_page(struct zone *z)
+{
+ int order;
+
+ for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) {
+ struct free_area *area = z->free_area[NOT_ZEROED] + order;
+ if (!list_empty(&area->free_list)) {
+ struct page *page = scrubd_rmpage(z, area);
+ struct list_head *l;
+ int size = PAGE_SIZE << order;
+
+ if (!page)
+ continue;
+
+ list_for_each(l, &zero_drivers) {
+ struct zero_driver *driver = list_entry(l, struct zero_driver, list);
+
+ if (driver->start(page_address(page), size) == 0)
+ goto done;
+ }
+
+ /* Unable to find a zeroing device that would
+ * deal with this page so just do it on our own.
+ * This will likely thrash the cpu caches.
+ */
+ cond_resched();
+ prep_zero_page(page, order);
+done:
+ end_zero_page(page, order);
+ cond_resched();
+ return 1 << order;
+ }
+ }
+ return 0;
+}
+
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+ int i;
+ unsigned long pages_zeroed;
+
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ pages_zeroed = 0;
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ pages_zeroed += zero_highest_order_page(zone);
+ }
+ } while (pages_zeroed);
+}
+
+/*
+ * The background scrub daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kscrubd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ cpumask_t cpumask;
+
+ daemonize("kscrubd%d", pgdat->node_id);
+ cpumask = node_to_cpumask(pgdat->node_id);
+ if (!cpus_empty(cpumask))
+ set_cpus_allowed(tsk, cpumask);
+
+ tsk->flags |= PF_MEMALLOC | PF_KSCRUBD;
+
+ for ( ; ; ) {
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_FREEZE);
+ prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&pgdat->kscrubd_wait, &wait);
+
+ scrub_pgdat(pgdat);
+ }
+ return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+ pg_data_t *pgdat;
+ for_each_pgdat(pgdat)
+ pgdat->kscrubd
+ = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+ return 0;
+}
+
+module_init(kscrubd_init)
Index: linux-2.6.10/include/linux/scrub.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/include/linux/scrub.h 2005-01-10 14:34:25.000000000 -0800
@@ -0,0 +1,49 @@
+#ifndef _LINUX_SCRUB_H
+#define _LINUX_SCRUB_H
+
+/*
+ * Definitions for scrubbing of memory include an interface
+ * for drivers that may that allow the zeroing of memory
+ * without invalidating the caches.
+ *
+ * Christoph Lameter, December 2004.
+ */
+
+struct zero_driver {
+ int (*start)(void *, unsigned long); /* Start bzero transfer */
+ struct list_head list;
+};
+
+extern struct list_head zero_drivers;
+
+extern unsigned int sysctl_scrub_start;
+extern unsigned int sysctl_scrub_stop;
+extern unsigned int sysctl_scrub_load;
+
+/* Registering and unregistering zero drivers */
+static inline void register_zero_driver(struct zero_driver *z)
+{
+ list_add(&z->list, &zero_drivers);
+}
+
+static inline void unregister_zero_driver(struct zero_driver *z)
+{
+ list_del(&z->list);
+}
+
+extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area);
+
+static void inline wakeup_kscrubd(struct zone *zone)
+{
+ if (avenrun[0] >= ((unsigned long)sysctl_scrub_load << FSHIFT))
+ return;
+ if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+ return;
+ wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
+int scrub_start_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+extern void end_zero_page(struct page *page, unsigned int order);
+#endif
Index: linux-2.6.10/kernel/sysctl.c
===================================================================
--- linux-2.6.10.orig/kernel/sysctl.c 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/kernel/sysctl.c 2005-01-10 13:54:50.000000000 -0800
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/scrub.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
@@ -827,6 +828,33 @@
.strategy = &sysctl_jiffies,
},
#endif
+ {
+ .ctl_name = VM_SCRUB_START,
+ .procname = "scrub_start",
+ .data = &sysctl_scrub_start,
+ .maxlen = sizeof(sysctl_scrub_start),
+ .mode = 0644,
+ .proc_handler = &scrub_start_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_STOP,
+ .procname = "scrub_stop",
+ .data = &sysctl_scrub_stop,
+ .maxlen = sizeof(sysctl_scrub_stop),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_LOAD,
+ .procname = "scrub_load",
+ .data = &sysctl_scrub_load,
+ .maxlen = sizeof(sysctl_scrub_load),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

Index: linux-2.6.10/include/linux/sysctl.h
===================================================================
--- linux-2.6.10.orig/include/linux/sysctl.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/sysctl.h 2005-01-10 13:54:50.000000000 -0800
@@ -169,6 +169,9 @@
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+ VM_SCRUB_START=30, /* percentage * 10 at which to start scrubd */
+ VM_SCRUB_STOP=31, /* percentage * 10 at which to stop scrubd */
+ VM_SCRUB_LOAD=32, /* Load factor at which not to scrub anymore */
};


Index: linux-2.6.10/include/linux/gfp.h
===================================================================
--- linux-2.6.10.orig/include/linux/gfp.h 2005-01-10 13:48:11.000000000 -0800
+++ linux-2.6.10/include/linux/gfp.h 2005-01-10 13:54:50.000000000 -0800
@@ -132,4 +132,5 @@

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order);
#endif /* __LINUX_GFP_H */

2005-01-11 00:57:09

by Chris Wright

[permalink] [raw]
Subject: Re: Prezeroing V4 [1/4]: Arch specific page zeroing during page fault

* Christoph Lameter ([email protected]) wrote:
> @@ -1795,7 +1786,7 @@
>
> if (unlikely(anon_vma_prepare(vma)))
> goto no_mem;
> - page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
> + page = alloc_zeroed_user_highpage(vma, addr);

Oops, HIGHZERO is gone already in Linus' tree.

thanks,
-chris
--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net

2005-01-11 01:03:26

by Chris Wright

[permalink] [raw]
Subject: Re: Prezeroing V4 [1/4]: Arch specific page zeroing during page fault

* Christoph Lameter ([email protected]) wrote:
> Use bk13 as I indicated.

Ah, so you did, thanks ;-)
-chris
--
Linux Security Modules http://lsm.immunix.org http://lsm.bkbits.net

2005-01-11 00:58:20

by Christoph Lameter

[permalink] [raw]
Subject: Re: Prezeroing V4 [1/4]: Arch specific page zeroing during page fault

On Mon, 10 Jan 2005, Chris Wright wrote:

> * Christoph Lameter ([email protected]) wrote:
> > @@ -1795,7 +1786,7 @@
> >
> > if (unlikely(anon_vma_prepare(vma)))
> > goto no_mem;
> > - page = alloc_page_vma(GFP_HIGHZERO, vma, addr);
> > + page = alloc_zeroed_user_highpage(vma, addr);
>
> Oops, HIGHZERO is gone already in Linus' tree.

Use bk13 as I indicated.

2005-01-21 20:11:42

by Christoph Lameter

[permalink] [raw]
Subject: alloc_zeroed_user_highpage to fix the clear_user_highpage issue

This patch adds a new function alloc_zeroed_user_highpage that is then used in the
anonymous page fault handler and in the COW code to allocate zeroed pages. The function
can be defined per arch to setup special processing for user pages by defining
__HAVE_ARCH_ALLOC_ZEROED_USER_PAGE. For arches that do not need to do special things
for user pages, alloc_zeroed_user_highpage is defined to simply do

alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)

Patch against 2.6.11-rc1-bk9

This patch needs to update a number of archs. Wish there was a better way
to do this.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-21 10:44:27.000000000 -0800
@@ -42,6 +42,17 @@ static inline void clear_user_highpage(s
smp_wmb();
}

+#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+static inline struct page* alloc_zeroed_user_highpage(struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+
+ clear_user_highpage(page, vaddr);
+ return page;
+}
+#endif
+
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/memory.c 2005-01-21 11:10:42.000000000 -0800
@@ -84,20 +84,6 @@ EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);

/*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
-{
- if (from == ZERO_PAGE(address)) {
- clear_user_highpage(to, address);
- return;
- }
- copy_user_highpage(to, from, address);
-}
-
-/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
@@ -1329,11 +1315,16 @@ static int do_wp_page(struct mm_struct *

if (unlikely(anon_vma_prepare(vma)))
goto no_new_page;
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!new_page)
- goto no_new_page;
- copy_cow_page(old_page,new_page,address);
-
+ if (old_page == ZERO_PAGE(address)) {
+ new_page = alloc_zeroed_user_highpage(vma, address);
+ if (!new_page)
+ goto no_new_page;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ if (!new_page)
+ goto no_new_page;
+ copy_user_highpage(new_page, old_page, address);
+ }
/*
* Re-check the pte - we dropped the lock
*/
@@ -1795,10 +1786,9 @@ do_anonymous_page(struct mm_struct *mm,

if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ page = alloc_zeroed_user_highpage(vma, addr);
if (!page)
goto no_mem;
- clear_user_highpage(page, addr);

spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, addr);
Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -75,6 +75,16 @@ do { \
flush_dcache_page(page); \
} while (0)

+
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+({ \
+ struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+ flush_dcache_page(page); \
+ page; \
+})
+
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)

#ifdef CONFIG_VIRTUAL_MEM_MAP
Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -36,6 +36,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -38,6 +38,8 @@ void copy_page(void *, void *);
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -17,6 +17,9 @@ extern void copy_page(void *to, void *fr
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -18,6 +18,9 @@
extern void clear_page(void *page);
#define clear_user_page(page, vaddr, pg) clear_page(page)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -21,6 +21,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -106,6 +106,9 @@ static inline void copy_page(void *to, v
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/* Pure 2^n version of get_order */
extern __inline__ int get_order(unsigned long size)
{
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-21 10:44:27.000000000 -0800
@@ -30,6 +30,9 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
/*
* These are used to make use of C type-checking..
*/

2005-01-21 20:18:07

by Christoph Lameter

[permalink] [raw]
Subject: Extend clear_page by an order parameter

The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
clear_page that is capable of zeroing multiple pages at once (and scrubd
too but that is now an independent patch). The following patch extends
clear_page with a second parameter specifying the order of the page to be zeroed to allow an
efficient zeroing of pages. Hope I caught everything....

Patch against 2.6.11-rc1-bk9

Architecture support:
---------------------

Known to work:

ia64
i386
x86_64
sparc64
m68k

Trivial modification expected to simply work:

arm
cris
h8300
m68knommu
ppc
ppc64
sh64
v850
parisc
sparc
um

Modification made but it would be good to have some feedback from the arch maintainers:

s390
alpha
sh
mips
m32r

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-21 11:51:39.000000000 -0800
@@ -591,11 +591,16 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+ if (!PageHighMem(page)) {
+ clear_page(page_address(page), order);
+ return;
+ }
+
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.10/mm/hugetlb.c
===================================================================
--- linux-2.6.10.orig/mm/hugetlb.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/hugetlb.c 2005-01-21 11:51:39.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER);
return page;
}

Index: linux-2.6.10/include/linux/highmem.h
===================================================================
--- linux-2.6.10.orig/include/linux/highmem.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/linux/highmem.h 2005-01-21 11:51:39.000000000 -0800
@@ -45,7 +45,7 @@ static inline void clear_user_highpage(s
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page, KM_USER0);
- clear_page(kaddr);
+ clear_page(kaddr, 0);
kunmap_atomic(kaddr, KM_USER0);
}

Index: linux-2.6.10/drivers/net/tc35815.c
===================================================================
--- linux-2.6.10.orig/drivers/net/tc35815.c 2004-12-24 13:33:48.000000000 -0800
+++ linux-2.6.10/drivers/net/tc35815.c 2005-01-21 11:51:39.000000000 -0800
@@ -657,7 +657,7 @@ tc35815_init_queues(struct net_device *d
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
} else {
- clear_page(lp->fd_buf);
+ clear_page(lp->fd_buf, 0);
#ifdef __mips__
dma_cache_wback_inv((unsigned long)lp->fd_buf, PAGE_SIZE * FD_PAGE_NUM);
#endif
Index: linux-2.6.10/fs/afs/file.c
===================================================================
--- linux-2.6.10.orig/fs/afs/file.c 2004-12-24 13:35:59.000000000 -0800
+++ linux-2.6.10/fs/afs/file.c 2005-01-21 11:51:39.000000000 -0800
@@ -172,7 +172,7 @@ static int afs_file_readpage(struct file
(size_t) PAGE_SIZE);
desc.buffer = kmap(page);

- clear_page(desc.buffer);
+ clear_page(desc.buffer, 0);

/* read the contents of the file from the server into the
* page */
Index: linux-2.6.10/fs/ntfs/compress.c
===================================================================
--- linux-2.6.10.orig/fs/ntfs/compress.c 2004-12-24 13:34:45.000000000 -0800
+++ linux-2.6.10/fs/ntfs/compress.c 2005-01-21 11:51:39.000000000 -0800
@@ -107,7 +107,7 @@ static void zero_partial_compressed_page
* FIXME: Using clear_page() will become wrong when we get
* PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
*/
- clear_page(kp);
+ clear_page(kp, 0);
return;
}
kp_ofs = ni->initialized_size & ~PAGE_CACHE_MASK;
@@ -742,7 +742,7 @@ lock_retry_remap:
* for now there is no problem.
*/
if (likely(!cur_ofs))
- clear_page(page_address(page));
+ clear_page(page_address(page), 0);
else
memset(page_address(page) + cur_ofs, 0,
PAGE_CACHE_SIZE -
Index: linux-2.6.10/include/asm-ia64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ia64/page.h 2004-12-24 13:34:00.000000000 -0800
+++ linux-2.6.10/include/asm-ia64/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -56,7 +56,7 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_page (void *page, int order);
extern void copy_page (void *to, void *from);

/*
@@ -65,7 +65,7 @@ extern void copy_page (void *to, void *f
*/
#define clear_user_page(addr, vaddr, page) \
do { \
- clear_page(addr); \
+ clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)

Index: linux-2.6.10/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/ia64/lib/clear_page.S 2004-12-24 13:33:50.000000000 -0800
+++ linux-2.6.10/arch/ia64/lib/clear_page.S 2005-01-21 11:51:39.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

GLOBAL_ENTRY(clear_page)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
Index: linux-2.6.10/include/asm-i386/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/page.h 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/include/asm-i386/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_page(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,12 +28,12 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-i386/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-i386/mmx.h 2005-01-21 11:51:39.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.10.orig/arch/i386/lib/mmx.c 2004-12-24 13:34:48.000000000 -0800
+++ linux-2.6.10/arch/i386/lib/mmx.c 2005-01-21 11:51:39.000000000 -0800
@@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void *
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.10/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/page.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -32,10 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_page(void *, int);
void copy_page(void *, void *);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-x86_64/mmx.h
===================================================================
--- linux-2.6.10.orig/include/asm-x86_64/mmx.h 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/include/asm-x86_64/mmx.h 2005-01-21 11:51:39.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.10/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/x86_64/lib/clear_page.S 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/arch/x86_64/lib/clear_page.S 2005-01-21 11:51:39.000000000 -0800
@@ -1,12 +1,16 @@
/*
* Zero a page.
* rdi page
+ * rsi order
*/
.globl clear_page
.p2align 4
clear_page:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
- movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -41,7 +45,10 @@ clear_page_end:

.section .altinstr_replacement,"ax"
clear_page_c:
- movl $4096/8,%ecx
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
rep
stosq
Index: linux-2.6.10/include/asm-sparc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-sparc/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -28,10 +28,10 @@

#ifndef __ASSEMBLY__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)
#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
sparc_flush_page_to_ram(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-s390/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-s390/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-s390/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -22,12 +22,12 @@

#ifndef __s390x__

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
register_pair rp;

rp.subreg.even = (unsigned long) page;
- rp.subreg.odd = (unsigned long) 4096;
+ rp.subreg.odd = (unsigned long) 4096 << order;
asm volatile (" slr 1,1\n"
" mvcl %0,0"
: "+&a" (rp) : : "memory", "cc", "1" );
@@ -63,14 +63,19 @@ static inline void copy_page(void *to, v

#else /* __s390x__ */

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
- asm volatile (" lgr 2,%0\n"
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ asm volatile (" lgr 2,%0\n"
" lghi 3,4096\n"
" slgr 1,1\n"
" mvcl 2,0"
: : "a" ((void *) (page))
: "memory", "cc", "1", "2", "3" );
+ page += PAGE_SIZE;
+ }
}

static inline void copy_page(void *to, void *from)
@@ -103,7 +108,7 @@ static inline void copy_page(void *to, v

#endif /* __s390x__ */

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/* Pure 2^n version of get_order */
Index: linux-2.6.10/include/asm-sh/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh/page.h 2004-12-24 13:35:28.000000000 -0800
+++ linux-2.6.10/include/asm-sh/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -36,12 +36,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void (*clear_page)(void *to);
+extern void (*_clear_page)(void *to);
extern void (*copy_page)(void *to, void *from);

extern void clear_page_slow(void *to);
extern void copy_page_slow(void *to, void *from);

+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
#if defined(CONFIG_SH7705_CACHE_32KB) && defined(CONFIG_MMU)
struct page;
extern void clear_user_page(void *to, unsigned long address, struct page *pg);
@@ -49,7 +59,7 @@ extern void copy_user_page(void *to, voi
extern void __clear_user_page(void *to, void *orig_to);
extern void __copy_user_page(void *to, void *from, void *orig_to);
#elif defined(CONFIG_CPU_SH2) || defined(CONFIG_CPU_SH3) || !defined(CONFIG_MMU)
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#elif defined(CONFIG_CPU_SH4)
struct page;
Index: linux-2.6.10/arch/alpha/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/clear_page.S 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/clear_page.S 2005-01-21 11:51:39.000000000 -0800
@@ -6,11 +6,10 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0
-
lda $0,128
nop
unop
@@ -36,4 +35,4 @@ clear_page:
unop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/include/asm-sh64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sh64/page.h 2004-12-24 13:34:33.000000000 -0800
+++ linux-2.6.10/include/asm-sh64/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -50,12 +50,20 @@ extern struct page *mem_map;
extern void sh64_page_clear(void *page);
extern void sh64_page_copy(void *from, void *to);

-#define clear_page(page) sh64_page_clear(page)
+static inline void clear_page(page, order)
+{
+ int nr = 1 << order;
+
+ while (nr-- >0) {
+ sh64_page_clear(page++, 0);
+ }
+}
+
#define copy_page(to,from) sh64_page_copy(from, to)

#if defined(CONFIG_DCACHE_DISABLED)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) sh_clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

#else
Index: linux-2.6.10/include/asm-h8300/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-h8300/page.h 2004-12-24 13:35:25.000000000 -0800
+++ linux-2.6.10/include/asm-h8300/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-arm/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm/page.h 2004-12-24 13:34:01.000000000 -0800
+++ linux-2.6.10/include/asm-arm/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -128,7 +128,7 @@ extern void __cpu_copy_user_page(void *t
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
extern void copy_page(void *to, const void *from);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-ppc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc64/page.h 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/include/asm-ppc64/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -102,12 +102,12 @@
#define REGION_MASK (((1UL<<REGION_SIZE)-1UL)<<REGION_SHIFT)
#define REGION_STRIDE (1UL << REGION_SHIFT)

-static __inline__ void clear_page(void *addr)
+static __inline__ void clear_page(void *addr, unsigned int order)
{
unsigned long lines, line_size;

line_size = ppc64_caches.dline_size;
- lines = ppc64_caches.dlines_per_page;
+ lines = ppc64_caches.dlines_per_page << order;

__asm__ __volatile__(
"mtctr %1 # clear_page\n\
Index: linux-2.6.10/include/asm-m32r/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m32r/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-m32r/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -11,10 +11,22 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void *to);
+extern void _clear_page(void *to);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- > 0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+
extern void copy_page(void *to, void *from);

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-alpha/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-alpha/page.h 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/include/asm-alpha/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -15,8 +15,20 @@

#define STRICT_MM_TYPECHECKS

-extern void clear_page(void *page);
-#define clear_user_page(page, vaddr, pg) clear_page(page)
+extern void _clear_page(void *page);
+
+static inline void clear_page(void *page, int order)
+{
+ int nr = 1 << order;
+
+ while (nr--)
+ {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)

extern void copy_page(void * _to, void * _from);
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.10/arch/mips/mm/pg-sb1.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-sb1.c 2004-12-24 13:35:50.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-sb1.c 2005-01-21 11:51:39.000000000 -0800
@@ -42,7 +42,7 @@
#ifdef CONFIG_SIBYTE_DMA_PAGEOPS
static inline void clear_page_cpu(void *page)
#else
-void clear_page(void *page)
+void _clear_page(void *page)
#endif
{
unsigned char *addr = (unsigned char *) page;
@@ -172,14 +172,13 @@ void sb1_dma_init(void)
IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_BASE)));
}

-void clear_page(void *page)
+void _clear_page(void *page)
{
int cpu = smp_processor_id();

/* if the page is above Kseg0, use old way */
if (KSEGX(page) != CAC_BASE)
return clear_page_cpu(page);
-
page_descr[cpu].dscr_a = PHYSADDR(page) | M_DM_DSCRA_ZERO_MEM | M_DM_DSCRA_L2C_DEST | M_DM_DSCRA_INTERRUPT;
page_descr[cpu].dscr_b = V_DM_DSCRB_SRC_LENGTH(PAGE_SIZE);
__raw_writeq(1, IOADDR(A_DM_REGISTER(cpu, R_DM_DSCR_COUNT)));
@@ -218,5 +217,5 @@ void copy_page(void *to, void *from)

#endif

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);
EXPORT_SYMBOL(copy_page);
Index: linux-2.6.10/include/asm-m68k/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68k/page.h 2004-12-24 13:35:49.000000000 -0800
+++ linux-2.6.10/include/asm-m68k/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -50,7 +50,7 @@ static inline void copy_page(void *to, v
);
}

-static inline void clear_page(void *page)
+static inline void clear_page(void *page, int order)
{
unsigned long tmp;
unsigned long *sp = page;
@@ -69,16 +69,16 @@ static inline void clear_page(void *page
"dbra %1,1b\n\t"
: "=a" (sp), "=d" (tmp)
: "a" (page), "0" (sp),
- "1" ((PAGE_SIZE - 16) / 16 - 1));
+ "1" (((PAGE_SIZE<<(order)) - 16) / 16 - 1));
}

#else
-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)
#endif

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-mips/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-mips/page.h 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/include/asm-mips/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -39,7 +39,18 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-extern void clear_page(void * page);
+extern void _clear_page(void * page);
+
+static inline void clear_page(void *page, int order)
+{
+ unsigned int nr = 1 << order;
+
+ while (nr-- >0) {
+ _clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
+
extern void copy_page(void * to, void * from);

extern unsigned long shm_align_mask;
@@ -57,7 +68,7 @@ static inline void clear_user_page(void
{
extern void (*flush_data_cache_page)(unsigned long addr);

- clear_page(addr);
+ clear_page(addr, 0);
if (pages_do_alias((unsigned long) addr, vaddr))
flush_data_cache_page((unsigned long)addr);
}
Index: linux-2.6.10/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-m68knommu/page.h 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/include/asm-m68knommu/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -24,10 +24,10 @@
#define get_user_page(vaddr) __get_free_page(GFP_KERNEL)
#define free_user_page(page, addr) free_page(addr)

-#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-cris/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-cris/page.h 2004-12-24 13:34:30.000000000 -0800
+++ linux-2.6.10/include/asm-cris/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -15,10 +15,10 @@

#ifdef __KERNEL__

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

-#define clear_user_page(page, vaddr, pg) clear_page(page)
+#define clear_user_page(page, vaddr, pg) clear_page(page, 0)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

/*
Index: linux-2.6.10/include/asm-v850/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-v850/page.h 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/include/asm-v850/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -37,11 +37,11 @@

#define STRICT_MM_TYPECHECKS

-#define clear_page(page) memset ((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset ((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to, from) memcpy ((void *)(to), (void *)from, PAGE_SIZE)

#define clear_user_page(addr, vaddr, page) \
- do { clear_page(addr); \
+ do { clear_page(addr, 0); \
flush_dcache_page(page); \
} while (0)
#define copy_user_page(to, from, vaddr, page) \
Index: linux-2.6.10/include/asm-parisc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-parisc/page.h 2004-12-24 13:34:26.000000000 -0800
+++ linux-2.6.10/include/asm-parisc/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -13,7 +13,7 @@
#include <asm/types.h>
#include <asm/cache.h>

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_page(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) copy_user_page_asm((void *)(to), (void *)(from))

struct page;
Index: linux-2.6.10/arch/arm/mm/copypage-v6.c
===================================================================
--- linux-2.6.10.orig/arch/arm/mm/copypage-v6.c 2004-12-24 13:34:31.000000000 -0800
+++ linux-2.6.10/arch/arm/mm/copypage-v6.c 2005-01-21 11:51:39.000000000 -0800
@@ -47,7 +47,7 @@ void v6_copy_user_page_nonaliasing(void
*/
void v6_clear_user_page_nonaliasing(void *kaddr, unsigned long vaddr)
{
- clear_page(kaddr);
+ _clear_page(kaddr);
}

/*
@@ -116,7 +116,7 @@ void v6_clear_user_page_aliasing(void *k

set_pte(to_pte + offset, pfn_pte(__pa(kaddr) >> PAGE_SHIFT, to_pgprot));
flush_tlb_kernel_page(to);
- clear_page((void *)to);
+ _clear_page((void *)to);

spin_unlock(&v6_lock);
}
Index: linux-2.6.10/arch/m32r/mm/page.S
===================================================================
--- linux-2.6.10.orig/arch/m32r/mm/page.S 2004-12-24 13:34:57.000000000 -0800
+++ linux-2.6.10/arch/m32r/mm/page.S 2005-01-21 11:51:39.000000000 -0800
@@ -51,7 +51,7 @@ copy_page:
jmp r14

.text
- .global clear_page
+ .global _clear_page
/*
* clear_page (to)
*
@@ -60,7 +60,7 @@ copy_page:
* 16 * 256
*/
.align 4
-clear_page:
+_clear_page:
ldi r2, #255
ldi r4, #0
ld r3, @r0 /* cache line allocate */
Index: linux-2.6.10/include/asm-ppc/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-ppc/page.h 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/include/asm-ppc/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -85,7 +85,7 @@ typedef unsigned long pgprot_t;

struct page;
extern void clear_pages(void *page, int order);
-static inline void clear_page(void *page) { clear_pages(page, 0); }
+#define clear_page clear_pages
extern void copy_page(void *to, void *from);
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
Index: linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/alpha/kernel/alpha_ksyms.c 2004-12-24 13:33:51.000000000 -0800
+++ linux-2.6.10/arch/alpha/kernel/alpha_ksyms.c 2005-01-21 11:51:39.000000000 -0800
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(__memset);
EXPORT_SYMBOL(__memsetw);
EXPORT_SYMBOL(__constant_c_memset);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(__direct_map_base);
EXPORT_SYMBOL(__direct_map_size);
Index: linux-2.6.10/arch/alpha/lib/ev6-clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/alpha/lib/ev6-clear_page.S 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/alpha/lib/ev6-clear_page.S 2005-01-21 11:51:39.000000000 -0800
@@ -6,9 +6,9 @@

.text
.align 4
- .global clear_page
- .ent clear_page
-clear_page:
+ .global _clear_page
+ .ent _clear_page
+_clear_page:
.prologue 0

lda $0,128
@@ -51,4 +51,4 @@ clear_page:
nop
nop

- .end clear_page
+ .end _clear_page
Index: linux-2.6.10/arch/sh/mm/init.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/init.c 2004-12-24 13:35:24.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/init.c 2005-01-21 11:51:39.000000000 -0800
@@ -57,7 +57,7 @@ bootmem_data_t discontig_node_bdata[MAX_
#endif

void (*copy_page)(void *from, void *to);
-void (*clear_page)(void *to);
+void (*_clear_page)(void *to);

void show_mem(void)
{
@@ -255,7 +255,7 @@ void __init mem_init(void)
* later in the boot process if a better method is available.
*/
copy_page = copy_page_slow;
- clear_page = clear_page_slow;
+ _clear_page = clear_page_slow;

/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem_node(NODE_DATA(0));
Index: linux-2.6.10/arch/sh/mm/pg-dma.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-dma.c 2004-12-24 13:35:00.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-dma.c 2005-01-21 11:51:39.000000000 -0800
@@ -78,7 +78,7 @@ static int __init pg_dma_init(void)
return ret;

copy_page = copy_page_dma;
- clear_page = clear_page_dma;
+ _clear_page = clear_page_dma;

return ret;
}
Index: linux-2.6.10/arch/sh/mm/pg-nommu.c
===================================================================
--- linux-2.6.10.orig/arch/sh/mm/pg-nommu.c 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/arch/sh/mm/pg-nommu.c 2005-01-21 11:51:39.000000000 -0800
@@ -27,7 +27,7 @@ static void clear_page_nommu(void *to)
static int __init pg_nommu_init(void)
{
copy_page = copy_page_nommu;
- clear_page = clear_page_nommu;
+ _clear_page = clear_page_nommu;

return 0;
}
Index: linux-2.6.10/arch/mips/mm/pg-r4k.c
===================================================================
--- linux-2.6.10.orig/arch/mips/mm/pg-r4k.c 2004-12-24 13:34:49.000000000 -0800
+++ linux-2.6.10/arch/mips/mm/pg-r4k.c 2005-01-21 11:51:39.000000000 -0800
@@ -39,9 +39,9 @@

static unsigned int clear_page_array[0x130 / 4];

-void clear_page(void * page) __attribute__((alias("clear_page_array")));
+void _clear_page(void * page) __attribute__((alias("clear_page_array")));

-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

/*
* Maximum sizes:
Index: linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c
===================================================================
--- linux-2.6.10.orig/arch/m32r/kernel/m32r_ksyms.c 2004-12-24 13:34:29.000000000 -0800
+++ linux-2.6.10/arch/m32r/kernel/m32r_ksyms.c 2005-01-21 11:51:39.000000000 -0800
@@ -102,7 +102,7 @@ EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memcmp);
EXPORT_SYMBOL(memscan);
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(_clear_page);

EXPORT_SYMBOL(strcat);
EXPORT_SYMBOL(strchr);
Index: linux-2.6.10/include/asm-arm26/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-arm26/page.h 2004-12-24 13:35:22.000000000 -0800
+++ linux-2.6.10/include/asm-arm26/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -25,7 +25,7 @@ extern void copy_page(void *to, const vo
preempt_enable(); \
} while (0)

-#define clear_page(page) memzero((void *)(page), PAGE_SIZE)
+#define clear_page(page, order) memzero((void *)(page), PAGE_SIZE << (order))
#define copy_page(to, from) __copy_user_page(to, from, 0);

#undef STRICT_MM_TYPECHECKS
Index: linux-2.6.10/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.10.orig/include/asm-sparc64/page.h 2004-12-24 13:34:32.000000000 -0800
+++ linux-2.6.10/include/asm-sparc64/page.h 2005-01-21 11:51:39.000000000 -0800
@@ -14,8 +14,8 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, unsigned long order);
+#define clear_page(X,Y) _clear_page((void *)(X),(Y))
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.10/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.10.orig/arch/sparc64/lib/clear_page.S 2004-12-24 13:35:23.000000000 -0800
+++ linux-2.6.10/arch/sparc64/lib/clear_page.S 2005-01-21 11:51:39.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8

2005-01-21 20:22:18

by Christoph Lameter

[permalink] [raw]
Subject: A scrub daemon (prezeroing)

Adds management of ZEROED and NOT_ZEROED pages and a background daemon
called scrubd. scrubd is disabled by default but can be enabled
by writing an order number to /proc/sys/vm/scrub_start. If a page
is coalesced of that order or higher then the scrub daemon will
start zeroing until all pages of order /proc/sys/vm/scrub_stop and
higher are zeroed and then go back to sleep.

In an SMP environment the scrub daemon is typically
running on the most idle cpu. Thus a single threaded application running
on one cpu may have the other cpu zeroing pages for it etc. The scrub
daemon is hardly noticable and usually finished zeroing quickly since
most processors are optimized for linear memory filling.

Note that this patch does not depend on any other patches but other
patches would improve what scrubd does. The extension of clear_pages by an
order parameter would increase the speed of zeroing and the patch
introducing alloc_zeroed_user_highpage is necessary for user
pages to be allocated from the pool of zeroed pages.

Patch against 2.6.11-rc1-bk9

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.10/mm/page_alloc.c
===================================================================
--- linux-2.6.10.orig/mm/page_alloc.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/page_alloc.c 2005-01-21 12:01:44.000000000 -0800
@@ -12,6 +12,8 @@
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ * Page zeroing by Christoph Lameter, SGI, Dec 2004 based on
+ * initial code for __GFP_ZERO support by Andrea Arcangeli, Oct 2004.
*/

#include <linux/config.h>
@@ -33,6 +35,7 @@
#include <linux/cpu.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/scrub.h>

#include <asm/tlbflush.h>
#include "internal.h"
@@ -167,16 +170,16 @@ static void destroy_compound_page(struct
* zone->lock is already acquired when we use these.
* So, we don't need atomic page->flags operations here.
*/
-static inline unsigned long page_order(struct page *page) {
+static inline unsigned long page_zorder(struct page *page) {
return page->private;
}

-static inline void set_page_order(struct page *page, int order) {
- page->private = order;
+static inline void set_page_zorder(struct page *page, int order, int zero) {
+ page->private = order + (zero << 10);
__SetPagePrivate(page);
}

-static inline void rmv_page_order(struct page *page)
+static inline void rmv_page_zorder(struct page *page)
{
__ClearPagePrivate(page);
page->private = 0;
@@ -187,14 +190,15 @@ static inline void rmv_page_order(struct
* we can do coalesce a page and its buddy if
* (a) the buddy is free &&
* (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order and the same
+ * zeroing status.
* for recording page's order, we use page->private and PG_private.
*
*/
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, int order, int zero)
{
if (PagePrivate(page) &&
- (page_order(page) == order) &&
+ (page_zorder(page) == order + (zero << 10)) &&
!PageReserved(page) &&
page_count(page) == 0)
return 1;
@@ -225,22 +229,20 @@ static inline int page_is_buddy(struct p
* -- wli
*/

-static inline void __free_pages_bulk (struct page *page, struct page *base,
- struct zone *zone, unsigned int order)
+static inline int __free_pages_bulk (struct page *page, struct page *base,
+ struct zone *zone, unsigned int order, int zero)
{
unsigned long page_idx;
struct page *coalesced;
- int order_size = 1 << order;

if (unlikely(order))
destroy_compound_page(page, order);

page_idx = page - base;

- BUG_ON(page_idx & (order_size - 1));
+ BUG_ON(page_idx & (( 1 << order) - 1));
BUG_ON(bad_range(zone, page));

- zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
struct free_area *area;
struct page *buddy;
@@ -250,20 +252,21 @@ static inline void __free_pages_bulk (st
buddy = base + buddy_idx;
if (bad_range(zone, buddy))
break;
- if (!page_is_buddy(buddy, order))
+ if (!page_is_buddy(buddy, order, zero))
break;
/* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = zone->free_area[zero] + order;
area->nr_free--;
- rmv_page_order(buddy);
+ rmv_page_zorder(buddy);
page_idx &= buddy_idx;
order++;
}
coalesced = base + page_idx;
- set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ set_page_zorder(coalesced, order, zero);
+ list_add(&coalesced->lru, &zone->free_area[zero][order].free_list);
+ zone->free_area[zero][order].nr_free++;
+ return order;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -312,8 +315,11 @@ free_pages_bulk(struct zone *zone, int c
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, base, zone, order);
+ if (__free_pages_bulk(page, base, zone, order, NOT_ZEROED)
+ >= sysctl_scrub_start)
+ wakeup_kscrubd(zone);
ret++;
+ zone->free_pages += 1UL << order;
}
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
@@ -341,6 +347,18 @@ void __free_pages_ok(struct page *page,
free_pages_bulk(page_zone(page), 1, &list, order);
}

+void end_zero_page(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ struct zone * zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ __free_pages_bulk(page, zone->zone_mem_map, zone, order, ZEROED);
+ zone->zero_pages += 1UL << order;
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+}

/*
* The order of subdivision here is critical for the IO subsystem.
@@ -358,7 +376,7 @@ void __free_pages_ok(struct page *page,
*/
static inline struct page *
expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area)
+ int low, int high, struct free_area *area, int zero)
{
unsigned long size = 1 << high;

@@ -369,7 +387,7 @@ expand(struct zone *zone, struct page *p
BUG_ON(bad_range(zone, &page[size]));
list_add(&page[size].lru, &area->free_list);
area->nr_free++;
- set_page_order(&page[size], high);
+ set_page_zorder(&page[size], high, zero);
}
return page;
}
@@ -420,23 +438,44 @@ static void prep_new_page(struct page *p
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static void inline rmpage(struct page *page, struct free_area *area)
+{
+ list_del(&page->lru);
+ rmv_page_zorder(page);
+ area->nr_free--;
+}
+
+struct page *scrubd_rmpage(struct zone *zone, struct free_area *area)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+ rmpage(page, area);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return page;
+}
+
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero)
{
- struct free_area * area;
+ struct free_area *area;
unsigned int current_order;
struct page *page;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ area = zone->free_area[zero] + current_order;
if (list_empty(&area->free_list))
continue;

page = list_entry(area->free_list.next, struct page, lru);
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ rmpage(page, zone->free_area[zero] + current_order);
zone->free_pages -= 1UL << order;
- return expand(zone, page, order, current_order, area);
+ if (zero)
+ zone->zero_pages -= 1UL << order;
+ return expand(zone, page, order, current_order, area, zero);
}

return NULL;
@@ -448,7 +487,7 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list, int zero)
{
unsigned long flags;
int i;
@@ -457,7 +496,7 @@ static int rmqueue_bulk(struct zone *zon

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
if (page == NULL)
break;
allocated++;
@@ -504,7 +543,7 @@ void mark_free_pages(struct zone *zone)
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));

for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
+ list_for_each(curr, &zone->free_area[NOT_ZEROED][order].free_list) {
unsigned long start_pfn, i;

start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
@@ -591,7 +630,7 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;

@@ -610,7 +649,9 @@ buffered_rmqueue(struct zone *zone, int
{
unsigned long flags;
struct page *page = NULL;
- int cold = !!(gfp_flags & __GFP_COLD);
+ int nr_pages = 1 << order;
+ int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages);
+ int cold = !!(gfp_flags & __GFP_COLD) + 2*zero;

if (order == 0) {
struct per_cpu_pages *pcp;
@@ -619,7 +660,7 @@ buffered_rmqueue(struct zone *zone, int
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list, zero);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -631,16 +672,25 @@ buffered_rmqueue(struct zone *zone, int

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
+ /*
+ * If we failed to obtain a zero and/or unzeroed page
+ * then we may still be able to obtain the other
+ * type of page.
+ */
+ if (!page) {
+ page = __rmqueue(zone, order, !zero);
+ zero = 0;
+ }
spin_unlock_irqrestore(&zone->lock, flags);
}

if (page != NULL) {
BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
+ mod_page_state_zone(zone, pgalloc, nr_pages);
prep_new_page(page, order);

- if (gfp_flags & __GFP_ZERO)
+ if ((gfp_flags & __GFP_ZERO) && !zero)
prep_zero_page(page, order, gfp_flags);

if (order && (gfp_flags & __GFP_COMP))
@@ -669,7 +719,7 @@ int zone_watermark_ok(struct zone *z, in
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ free_pages -= (z->free_area[NOT_ZEROED][o].nr_free + z->free_area[ZEROED][o].nr_free) << o;

/* Require fewer higher order pages to be free */
min >>= 1;
@@ -1046,7 +1096,7 @@ unsigned long __read_page_state(unsigned
}

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat)
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat)
{
struct zone *zones = pgdat->node_zones;
int i;
@@ -1054,27 +1104,31 @@ void __get_zone_counts(unsigned long *ac
*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
*active += zones[i].nr_active;
*inactive += zones[i].nr_inactive;
*free += zones[i].free_pages;
+ *zero += zones[i].zero_pages;
}
}

void get_zone_counts(unsigned long *active,
- unsigned long *inactive, unsigned long *free)
+ unsigned long *inactive, unsigned long *free, unsigned long *zero)
{
struct pglist_data *pgdat;

*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for_each_pgdat(pgdat) {
- unsigned long l, m, n;
- __get_zone_counts(&l, &m, &n, pgdat);
+ unsigned long l, m, n,o;
+ __get_zone_counts(&l, &m, &n, &o, pgdat);
*active += l;
*inactive += m;
*free += n;
+ *zero += o;
}
}

@@ -1111,6 +1165,7 @@ void si_meminfo_node(struct sysinfo *val

#define K(x) ((x) << (PAGE_SHIFT-10))

+const char *temp[3] = { "hot", "cold", "zero" };
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -1123,6 +1178,7 @@ void show_free_areas(void)
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;
struct zone *zone;

for_each_zone(zone) {
@@ -1143,10 +1199,10 @@ void show_free_areas(void)

pageset = zone->pageset + cpu;

- for (temperature = 0; temperature < 2; temperature++)
+ for (temperature = 0; temperature < 3; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
cpu,
- temperature ? "cold" : "hot",
+ temp[temperature],
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch);
@@ -1154,20 +1210,21 @@ void show_free_areas(void)
}

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

printk("\nFree pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));

printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
+ zero,
ps.nr_slab,
ps.nr_mapped,
ps.nr_page_table_pages);
@@ -1216,7 +1273,7 @@ void show_free_areas(void)

spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
+ nr = zone->free_area[NOT_ZEROED][order].nr_free + zone->free_area[ZEROED][order].nr_free;
total += nr << order;
printk("%lu*%lukB ", nr, K(1UL) << order);
}
@@ -1516,8 +1573,10 @@ void zone_init_free_lists(struct pglist_
{
int order;
for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
+ INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list);
+ zone->free_area[NOT_ZEROED][order].nr_free = 0;
+ zone->free_area[ZEROED][order].nr_free = 0;
}
}

@@ -1542,6 +1601,7 @@ static void __init free_area_init_core(s

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->kscrubd_wait);
pgdat->kswapd_max_order = 0;

for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -1565,6 +1625,7 @@ static void __init free_area_init_core(s
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->zero_pages = 0;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

@@ -1598,6 +1659,13 @@ static void __init free_area_init_core(s
pcp->high = 2 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1723,7 +1791,7 @@ static int frag_show(struct seq_file *m,
spin_lock_irqsave(&zone->lock, flags);
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_printf(m, "%6lu ", zone->free_area[NOT_ZEROED][order].nr_free);
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
Index: linux-2.6.10/include/linux/mmzone.h
===================================================================
--- linux-2.6.10.orig/include/linux/mmzone.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/linux/mmzone.h 2005-01-21 11:56:07.000000000 -0800
@@ -51,7 +51,7 @@ struct per_cpu_pages {
};

struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+ struct per_cpu_pages pcp[3]; /* 0: hot. 1: cold 2: cold zeroed pages */
#ifdef CONFIG_NUMA
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
@@ -107,10 +107,14 @@ struct per_cpu_pageset {
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/

+#define NOT_ZEROED 0
+#define ZEROED 1
+
struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+ unsigned long zero_pages;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
@@ -131,7 +135,7 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[2][MAX_ORDER];


ZONE_PADDING(_pad1_)
@@ -266,6 +270,9 @@ typedef struct pglist_data {
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
int kswapd_max_order;
+
+ wait_queue_head_t kscrubd_wait;
+ struct task_struct *kscrubd;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -274,9 +281,9 @@ typedef struct pglist_data {
extern struct pglist_data *pgdat_list;

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat);
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free);
+ unsigned long *free, unsigned long *zero);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Index: linux-2.6.10/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.10.orig/fs/proc/proc_misc.c 2005-01-21 10:43:58.000000000 -0800
+++ linux-2.6.10/fs/proc/proc_misc.c 2005-01-21 11:56:07.000000000 -0800
@@ -123,12 +123,13 @@ static int meminfo_read_proc(char *page,
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

/*
* display in kilobytes.
@@ -148,6 +149,7 @@ static int meminfo_read_proc(char *page,
len = sprintf(page,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemZero: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -171,6 +173,7 @@ static int meminfo_read_proc(char *page,
"VmallocChunk: %8lu kB\n",
K(i.totalram),
K(i.freeram),
+ K(zero),
K(i.bufferram),
K(get_page_cache_size()-total_swapcache_pages-i.bufferram),
K(total_swapcache_pages),
Index: linux-2.6.10/mm/readahead.c
===================================================================
--- linux-2.6.10.orig/mm/readahead.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/readahead.c 2005-01-21 11:56:07.000000000 -0800
@@ -573,7 +573,8 @@ unsigned long max_sane_readahead(unsigne
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;

- __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id()));
return min(nr, (inactive + free) / 2);
}
Index: linux-2.6.10/drivers/base/node.c
===================================================================
--- linux-2.6.10.orig/drivers/base/node.c 2005-01-21 10:43:56.000000000 -0800
+++ linux-2.6.10/drivers/base/node.c 2005-01-21 11:56:07.000000000 -0800
@@ -42,13 +42,15 @@ static ssize_t node_read_meminfo(struct
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;

si_meminfo_node(&i, nid);
- __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid));

n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
+ "Node %d MemZero: %8lu kB\n"
"Node %d MemUsed: %8lu kB\n"
"Node %d Active: %8lu kB\n"
"Node %d Inactive: %8lu kB\n"
@@ -58,6 +60,7 @@ static ssize_t node_read_meminfo(struct
"Node %d LowFree: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
+ nid, K(zero),
nid, K(i.totalram - i.freeram),
nid, K(active),
nid, K(inactive),
Index: linux-2.6.10/include/linux/sched.h
===================================================================
--- linux-2.6.10.orig/include/linux/sched.h 2005-01-21 10:44:03.000000000 -0800
+++ linux-2.6.10/include/linux/sched.h 2005-01-21 11:56:07.000000000 -0800
@@ -736,6 +736,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
+#define PF_KSCRUBD 0x00800000 /* I am kscrubd */

#ifdef CONFIG_SMP
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
Index: linux-2.6.10/mm/Makefile
===================================================================
--- linux-2.6.10.orig/mm/Makefile 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/mm/Makefile 2005-01-21 11:56:07.000000000 -0800
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o scrubd.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
Index: linux-2.6.10/mm/scrubd.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/mm/scrubd.c 2005-01-21 11:56:07.000000000 -0800
@@ -0,0 +1,134 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/suspend.h>
+#include <linux/sysctl.h>
+#include <linux/scrub.h>
+
+unsigned int sysctl_scrub_start = 5; /* if a page of this order is coalesed then run kscrubd */
+unsigned int sysctl_scrub_stop = 2; /* Mininum order of page to zero */
+unsigned int sysctl_scrub_load = 999; /* Do not run scrubd if load > */
+
+/*
+ * sysctl handler for /proc/sys/vm/scrub_start
+ */
+int scrub_start_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (sysctl_scrub_start < MAX_ORDER) {
+ struct zone *zone;
+
+ for_each_zone(zone)
+ wakeup_kscrubd(zone);
+ }
+ return 0;
+}
+
+LIST_HEAD(zero_drivers);
+
+/*
+ * zero_highest_order_page takes a page off the freelist
+ * and then hands it off to block zeroing agents.
+ * The cleared pages are added to the back of
+ * the freelist where the page allocator may pick them up.
+ */
+int zero_highest_order_page(struct zone *z)
+{
+ int order;
+
+ for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) {
+ struct free_area *area = z->free_area[NOT_ZEROED] + order;
+ if (!list_empty(&area->free_list)) {
+ struct page *page = scrubd_rmpage(z, area);
+ struct list_head *l;
+ int size = PAGE_SIZE << order;
+
+ if (!page)
+ continue;
+
+ list_for_each(l, &zero_drivers) {
+ struct zero_driver *driver = list_entry(l, struct zero_driver, list);
+
+ if (driver->start(page_address(page), size) == 0)
+ goto done;
+ }
+
+ /* Unable to find a zeroing device that would
+ * deal with this page so just do it on our own.
+ * This will likely thrash the cpu caches.
+ */
+ cond_resched();
+ prep_zero_page(page, order, 0);
+done:
+ end_zero_page(page, order);
+ cond_resched();
+ return 1 << order;
+ }
+ }
+ return 0;
+}
+
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+ int i;
+ unsigned long pages_zeroed;
+
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ pages_zeroed = 0;
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ pages_zeroed += zero_highest_order_page(zone);
+ }
+ } while (pages_zeroed);
+}
+
+/*
+ * The background scrub daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kscrubd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ cpumask_t cpumask;
+
+ daemonize("kscrubd%d", pgdat->node_id);
+ cpumask = node_to_cpumask(pgdat->node_id);
+ if (!cpus_empty(cpumask))
+ set_cpus_allowed(tsk, cpumask);
+
+ tsk->flags |= PF_MEMALLOC | PF_KSCRUBD;
+
+ for ( ; ; ) {
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_FREEZE);
+ prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&pgdat->kscrubd_wait, &wait);
+
+ scrub_pgdat(pgdat);
+ }
+ return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+ pg_data_t *pgdat;
+ for_each_pgdat(pgdat)
+ pgdat->kscrubd
+ = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+ return 0;
+}
+
+module_init(kscrubd_init)
Index: linux-2.6.10/include/linux/scrub.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.10/include/linux/scrub.h 2005-01-21 11:56:07.000000000 -0800
@@ -0,0 +1,49 @@
+#ifndef _LINUX_SCRUB_H
+#define _LINUX_SCRUB_H
+
+/*
+ * Definitions for scrubbing of memory include an interface
+ * for drivers that may that allow the zeroing of memory
+ * without invalidating the caches.
+ *
+ * Christoph Lameter, December 2004.
+ */
+
+struct zero_driver {
+ int (*start)(void *, unsigned long); /* Start bzero transfer */
+ struct list_head list;
+};
+
+extern struct list_head zero_drivers;
+
+extern unsigned int sysctl_scrub_start;
+extern unsigned int sysctl_scrub_stop;
+extern unsigned int sysctl_scrub_load;
+
+/* Registering and unregistering zero drivers */
+static inline void register_zero_driver(struct zero_driver *z)
+{
+ list_add(&z->list, &zero_drivers);
+}
+
+static inline void unregister_zero_driver(struct zero_driver *z)
+{
+ list_del(&z->list);
+}
+
+extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area);
+
+static void inline wakeup_kscrubd(struct zone *zone)
+{
+ if (avenrun[0] >= ((unsigned long)sysctl_scrub_load << FSHIFT))
+ return;
+ if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+ return;
+ wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
+int scrub_start_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+extern void end_zero_page(struct page *page, unsigned int order);
+#endif
Index: linux-2.6.10/kernel/sysctl.c
===================================================================
--- linux-2.6.10.orig/kernel/sysctl.c 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/kernel/sysctl.c 2005-01-21 11:56:07.000000000 -0800
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/scrub.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
@@ -827,6 +828,33 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
+ {
+ .ctl_name = VM_SCRUB_START,
+ .procname = "scrub_start",
+ .data = &sysctl_scrub_start,
+ .maxlen = sizeof(sysctl_scrub_start),
+ .mode = 0644,
+ .proc_handler = &scrub_start_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_STOP,
+ .procname = "scrub_stop",
+ .data = &sysctl_scrub_stop,
+ .maxlen = sizeof(sysctl_scrub_stop),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_LOAD,
+ .procname = "scrub_load",
+ .data = &sysctl_scrub_load,
+ .maxlen = sizeof(sysctl_scrub_load),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

Index: linux-2.6.10/include/linux/sysctl.h
===================================================================
--- linux-2.6.10.orig/include/linux/sysctl.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/linux/sysctl.h 2005-01-21 11:56:07.000000000 -0800
@@ -169,6 +169,9 @@ enum
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+ VM_SCRUB_START=30, /* percentage * 10 at which to start scrubd */
+ VM_SCRUB_STOP=31, /* percentage * 10 at which to stop scrubd */
+ VM_SCRUB_LOAD=32, /* Load factor at which not to scrub anymore */
};


Index: linux-2.6.10/include/linux/gfp.h
===================================================================
--- linux-2.6.10.orig/include/linux/gfp.h 2005-01-21 10:43:59.000000000 -0800
+++ linux-2.6.10/include/linux/gfp.h 2005-01-21 11:56:07.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order);
#endif /* __LINUX_GFP_H */

2005-01-21 22:35:52

by Paul Mackerras

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Christoph Lameter writes:

> The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
> clear_page that is capable of zeroing multiple pages at once (and scrubd
> too but that is now an independent patch). The following patch extends
> clear_page with a second parameter specifying the order of the page to be zeroed to allow an
> efficient zeroing of pages. Hope I caught everything....

Wouldn't it be nicer to call the version that takes the order
parameter "clear_pages" and then define clear_page(p) as
clear_pages(p, 0) ?

Paul.

2005-01-21 23:51:42

by Christoph Lameter

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

On Sat, 22 Jan 2005, Paul Mackerras wrote:

> Christoph Lameter writes:
>
> > The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
> > clear_page that is capable of zeroing multiple pages at once (and scrubd
> > too but that is now an independent patch). The following patch extends
> > clear_page with a second parameter specifying the order of the page to be zeroed to allow an
> > efficient zeroing of pages. Hope I caught everything....
>
> Wouldn't it be nicer to call the version that takes the order
> parameter "clear_pages" and then define clear_page(p) as
> clear_pages(p, 0) ?

clear_page clears one page of the specified order. clear_page cannot clear
multiple pages. Calling the function clear_pages would give a wrong
impression on what the function does and may lead to attempts to specify
the number of zero order pages as a parameter instead of the order.

2005-01-22 00:39:33

by Paul Mackerras

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Christoph Lameter writes:

> clear_page clears one page of the specified order.

Now you're really being confusing. A cluster of 2^n contiguous pages
isn't one page by any normal definition. Call it "clear_page_cluster"
or "clear_page_order" or something, but not "clear_page".

Paul.

2005-01-22 00:48:34

by Andrew Morton

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Paul Mackerras <[email protected]> wrote:
>
> A cluster of 2^n contiguous pages
> isn't one page by any normal definition.

It is, actually, from the POV of the page allocator. It's a "higher order
page" and is controlled by a struct page*, just like a zero-order page...

2005-01-22 01:08:15

by Paul Mackerras

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Andrew Morton writes:

> It is, actually, from the POV of the page allocator. It's a "higher order
> page" and is controlled by a struct page*, just like a zero-order page...

OK. I still reckon it's confusing terminology for the rest of us who
don't have our heads deep in the page allocator code.

Paul.

2005-01-22 01:21:06

by Roman Zippel

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Hi,

On Fri, 21 Jan 2005, Andrew Morton wrote:

> Paul Mackerras <[email protected]> wrote:
> >
> > A cluster of 2^n contiguous pages
> > isn't one page by any normal definition.
>
> It is, actually, from the POV of the page allocator. It's a "higher order
> page" and is controlled by a struct page*, just like a zero-order page...

OTOH we also have alloc_page/alloc_pages.

bye, Roman

2005-01-22 01:26:03

by Paul Mackerras

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Andrew Morton writes:

> It is, actually, from the POV of the page allocator. It's a "higher order
> page" and is controlled by a struct page*, just like a zero-order page...

So why is the function that gets me one of these "higher order pages"
called "get_free_pages" with an "s"? :)

Christoph's patch is bigger than it needs to be because he has to
change all the occurrences of clear_page(x) to clear_page(x, 0), and
then he has to change a lot of architectures' clear_page functions to
be called _clear_page instead. If he picked a different name for the
"clear a higher order page" function it would end up being less
invasive as well as less confusing.

The argument that clear_page is called that because it clears a higher
order page won't wash; all the clear_page implementations in his patch
are perfectly capable of clearing any contiguous set of 2^order pages
(oops, I mean "zero-order pages"), not just a "higher order page".

Paul.

2005-01-22 01:54:57

by Christoph Lameter

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

On Sat, 22 Jan 2005, Paul Mackerras wrote:

> Christoph's patch is bigger than it needs to be because he has to
> change all the occurrences of clear_page(x) to clear_page(x, 0), and
> then he has to change a lot of architectures' clear_page functions to
> be called _clear_page instead. If he picked a different name for the
> "clear a higher order page" function it would end up being less
> invasive as well as less confusing.

I had the name "zero_page" in V1 and V2 of the patch where it was
separate. Then someone complained about code duplication.

> The argument that clear_page is called that because it clears a higher
> order page won't wash; all the clear_page implementations in his patch
> are perfectly capable of clearing any contiguous set of 2^order pages
> (oops, I mean "zero-order pages"), not just a "higher order page".

clear_page is called clear_page because it clears one page of *any* order
not just higher orders. zero-order pages are not segregated nor are they
intrisincally better just because they contain more memory ;-).

2005-01-22 02:53:46

by Paul Mackerras

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Christoph Lameter writes:

> I had the name "zero_page" in V1 and V2 of the patch where it was
> separate. Then someone complained about code duplication.

Well, if you duplicated each arch's clear_page implementation in
zero_page, then yes, that would be unnecessary code duplication. I
would suggest that for architectures where the clear_page
implementation can easily be extended, rename it to clear_page_order
(or something) and #define clear_page(x) to be clear_page_order(x, 0).
For architectures where it can't, leave clear_page as clear_page and
define clear_page_order as an inline function that calls clear_page in
a loop.

> clear_page is called clear_page because it clears one page of *any* order
> not just higher orders. zero-order pages are not segregated nor are they
> intrisincally better just because they contain more memory ;-).

You have missed my point, which was about address constraints, not a
distinction between zero-order pages and higher-order pages.

Anyway, I remain of the opinion that your naming is inconsistent with
the naming of other functions that deal with zero-order and
higher-order pages, such as get_free_pages, alloc_pages, free_pages,
etc., and that your patch is unnecessarily intrusive. I guess it's up
to Andrew to decide which way we go.

Paul.

2005-01-23 07:46:22

by Andrew Morton

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

Christoph Lameter <[email protected]> wrote:
>
> The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
> clear_page that is capable of zeroing multiple pages at once (and scrubd
> too but that is now an independent patch). The following patch extends
> clear_page with a second parameter specifying the order of the page to be zeroed to allow an
> efficient zeroing of pages. Hope I caught everything....
>

Sorry, I take it back. As Paul says:

: Wouldn't it be nicer to call the version that takes the order
: parameter "clear_pages" and then define clear_page(p) as
: clear_pages(p, 0) ?

It would make the patch considerably smaller, and our naming is all over
the place anyway...

> -static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
> +void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
> {
> int i;
>
> BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
> + if (!PageHighMem(page)) {
> + clear_page(page_address(page), order);
> + return;
> + }
> +
> for(i = 0; i < (1 << order); i++)
> clear_highpage(page + i);
> }

I'd have thought that we'd want to make the new clear_pages() handle
highmem pages too, if only from a regularity POV. x86 hugetlbpages could
use it then, if someone thinks up a fast page-clearer.

2005-01-24 16:37:37

by Christoph Lameter

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

On Sat, 22 Jan 2005, Andrew Morton wrote:

> Christoph Lameter <[email protected]> wrote:
> >
> > The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
> > clear_page that is capable of zeroing multiple pages at once (and scrubd
> > too but that is now an independent patch). The following patch extends
> > clear_page with a second parameter specifying the order of the page to be zeroed to allow an
> > efficient zeroing of pages. Hope I caught everything....
> >
>
> Sorry, I take it back. As Paul says:
>
> : Wouldn't it be nicer to call the version that takes the order
> : parameter "clear_pages" and then define clear_page(p) as
> : clear_pages(p, 0) ?

> It would make the patch considerably smaller, and our naming is all over
> the place anyway...

Sounds good. Note though that this just means renaming clear_page to
clear_pages for all arches which would increase the patch size for the
arch specific section.

> I'd have thought that we'd want to make the new clear_pages() handle
> highmem pages too, if only from a regularity POV. x86 hugetlbpages could
> use it then, if someone thinks up a fast page-clearer.

That would get us back to code duplication. We would have a clear_page (no
highmem support) and a clear_pages (supporting highmem). Then it may
also be better to pass the page struct to clear_pages instead of a memory address.

2005-01-24 21:33:16

by David Miller

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

On Mon, 24 Jan 2005 08:37:15 -0800 (PST)
Christoph Lameter <[email protected]> wrote:

> Then it may also be better to pass the page struct to clear_pages
> instead of a memory address.

What is more generally available at the call sites at this time?
Consider both HIGHMEM and non-HIGHMEM setups in your estimation
please :-)

2005-01-24 21:33:18

by Christoph Lameter

[permalink] [raw]
Subject: Re: Extend clear_page by an order parameter

On Mon, 24 Jan 2005, David S. Miller wrote:

> On Mon, 24 Jan 2005 08:37:15 -0800 (PST)
> Christoph Lameter <[email protected]> wrote:
>
> > Then it may also be better to pass the page struct to clear_pages
> > instead of a memory address.
>
> What is more generally available at the call sites at this time?
> Consider both HIGHMEM and non-HIGHMEM setups in your estimation
> please :-)

The only call site is prep_zero_page which has a GFP flag, the order and
the pointer to struct page.

The patch makes the huge page code call prep_zero_page and scrubd will
also call prep_zero_page.

2005-02-09 09:58:36

by Michael Ellerman

[permalink] [raw]
Subject: [Patch] Fix oops in alloc_zeroed_user_highpage() when page is NULL

Hi All,

The generic and IA-64 versions of alloc_zeroed_user_highpage() don't check the return value from alloc_page_vma(). This can lead to an oops if we're OOM.

This fixes my oops on PPC64, but I haven't got an IA-64 machine/compiler handy.

Signed-off-by: Michael Ellerman <[email protected]>

diff -rN -p -u oombreakage-old/include/asm-ia64/page.h oombreakage-new/include/asm-ia64/page.h
--- oombreakage-old/include/asm-ia64/page.h 2005-02-04 04:10:37.000000000 +1100
+++ oombreakage-new/include/asm-ia64/page.h 2005-02-09 20:53:37.000000000 +1100
@@ -79,7 +79,8 @@ do { \
#define alloc_zeroed_user_highpage(vma, vaddr) \
({ \
struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
- flush_dcache_page(page); \
+ if (page) \
+ flush_dcache_page(page); \
page; \
})

diff -rN -p -u oombreakage-old/include/linux/highmem.h oombreakage-new/include/linux/highmem.h
--- oombreakage-old/include/linux/highmem.h 2005-02-09 20:22:41.000000000 +1100
+++ oombreakage-new/include/linux/highmem.h 2005-02-09 20:47:01.000000000 +1100
@@ -48,7 +48,9 @@ alloc_zeroed_user_highpage(struct vm_are
{
struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);

- clear_user_highpage(page, vaddr);
+ if (page)
+ clear_user_highpage(page, vaddr);
+
return page;
}
#endif



2005-02-10 00:39:01

by Christoph Lameter

[permalink] [raw]
Subject: Re: [Patch] Fix oops in alloc_zeroed_user_highpage() when page is NULL

On Wed, 9 Feb 2005, Michael Ellerman wrote:

> The generic and IA-64 versions of alloc_zeroed_user_highpage() don't
> check the return value from alloc_page_vma(). This can lead to an oops
> if we're OOM. This fixes my oops on PPC64, but I haven't got an IA-64
> machine/compiler handy.

Patch looks okay to me. These are the only occurences as far as I can tell
after reviewing the alloc_zeroed_user_higpage implementations in
include/asm-*/page.h.