2002-09-20 04:51:01

by Andrew Morton

Subject: [patch] remove page->virtual

Hot off the presses. Seems to work.




The patch removes page->virtual for all architectures which do not
define WANT_PAGE_VIRTUAL; the virtual address is looked up in a hash
table instead.

Possibly we could define WANT_PAGE_VIRTUAL for CONFIG_HIGHMEM4G, but
that seems unlikely.

A lot of the pressure went off kmap() and page_address() as a result
of the move to kmap_atomic(). kmap_atomic() should be the preferred
way of addressing any CPU load from the set_page_address() and
page_address() hashing and locking.
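
For illustration, a minimal sketch of the kmap_atomic() pattern (not
taken from the patch; KM_USER0 is just one plausible slot choice, and
nothing may sleep between the map and the unmap):

	#include <linux/mm.h>
	#include <linux/highmem.h>
	#include <linux/string.h>

	/*
	 * Sketch only: zero a possibly-highmem page through an atomic
	 * kmap slot, so the page_address()/set_page_address() hash is
	 * never touched.
	 */
	static void zero_page_sketch(struct page *page)
	{
		char *kaddr = kmap_atomic(page, KM_USER0);

		memset(kaddr, 0, PAGE_SIZE);
		kunmap_atomic(kaddr, KM_USER0);
	}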

If kmap_atomic() is not usable then the next best approach is for
users to cache the result of kmap() in a local variable rather than
calling page_address() repeatedly.
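
A minimal sketch of that caching pattern (again not from the patch;
process_byte() is a hypothetical stand-in for whatever per-byte work
the caller does):

	#include <linux/mm.h>
	#include <linux/highmem.h>

	static void process_byte(char c)
	{
		/* hypothetical stand-in for real work */
	}

	/*
	 * Sketch only: kmap() the page once and keep the returned
	 * address in a local, instead of calling page_address() and
	 * paying for the hash lookup on every iteration.
	 */
	static void walk_page_sketch(struct page *page)
	{
		char *kaddr = kmap(page);
		int i;

		for (i = 0; i < PAGE_SIZE; i++)
			process_byte(kaddr[i]);
		kunmap(page);
	}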


One heavy user of kmap() and page_address() is the ext2 directory code.

On a 7G Quad PIII, running four concurrent instances of

while true
do
find /usr/src/linux > /dev/null
done

on ext2 with everything cached, profiling shows that the new hashed
set_page_address() and page_address() implementations consume 0.4% and
1.3% of CPU time respectively. I think that's OK. (Plus the tested
code was doing an unneeded lookup in set_page_address(), for debug
purposes.)


c0141684 865 1.31499 page_address
c013851c 871 1.32411 kmem_cache_alloc
c015a068 1038 1.57799 dget_locked
c014ddfc 1220 1.85467 vfs_getattr
c014dd80 1320 2.00669 generic_fillattr
c0144f88 1359 2.06598 sys_chdir
c0159e60 1384 2.10398 dput
c014e334 1385 2.1055 cp_new_stat64
c01090c0 1741 2.6467 system_call
c0145b6a 1753 2.66494 .text.lock.open
c0151114 1786 2.71511 path_release
c0155d3c 2172 3.30192 filldir64
c0187580 2473 3.7595 ext2_readdir
c01eb950 2562 3.8948 atomic_dec_and_lock
c01eb690 2814 4.2779 __generic_copy_to_user
c015aba4 3194 4.85558 __d_lookup
c015207c 3567 5.42262 path_lookup
c01515dc 3775 5.73883 link_path_walk
c01eb99e 3811 5.79355 .text.lock.dec_and_lock
c01546b3 5847 8.88872 .text.lock.namei
c01884f2 6914 10.5108 .text.lock.dir



include/linux/mm.h | 48 ++++++++----------
init/main.c | 1
kernel/ksyms.c | 3 +
mm/highmem.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++---
mm/page_alloc.c | 5 +
5 files changed, 164 insertions(+), 34 deletions(-)

--- 2.5.36/include/linux/mm.h~remove-page-virtual Thu Sep 19 20:36:27 2002
+++ 2.5.36-akpm/include/linux/mm.h Thu Sep 19 21:03:06 2002
@@ -176,7 +176,7 @@ struct page {
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
-#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
+#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */
@@ -289,38 +289,34 @@ static inline void set_page_zone(struct
page->flags |= zone_num << ZONE_SHIFT;
}

-/*
- * In order to avoid #ifdefs within C code itself, we define
- * set_page_address to a noop for non-highmem machines, where
- * the field isn't useful.
- * The same is true for page_address() in arch-dependent code.
- */
-#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
+#define lowmem_page_address(page) \
+ __va( ( ((page) - page_zone(page)->zone_mem_map) \
+ + page_zone(page)->zone_start_pfn) << PAGE_SHIFT)
+
+#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
+#define HASHED_PAGE_VIRTUAL
+#endif

+#if defined(WANT_PAGE_VIRTUAL)
+#define page_address(page) ((page)->virtual)
#define set_page_address(page, address) \
do { \
(page)->virtual = (address); \
} while(0)
+#define page_address_init() do { } while(0)
+#endif

-#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
-#define set_page_address(page, address) do { } while(0)
-#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#if defined(HASHED_PAGE_VIRTUAL)
+void *page_address(struct page *page);
+void set_page_address(struct page *page, void *virtual);
+void page_address_init(void);
+#endif

-/*
- * Permanent address of a page. Obviously must never be
- * called on a highmem page.
- */
-#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
-
-#define page_address(page) ((page)->virtual)
-
-#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
-
-#define page_address(page) \
- __va( ( ((page) - page_zone(page)->zone_mem_map) \
- + page_zone(page)->zone_start_pfn) << PAGE_SHIFT)
-
-#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
+#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
+#define page_address(page) lowmem_page_address(page)
+#define set_page_address(page, address) do { } while(0)
+#define page_address_init() do { } while(0)
+#endif

/*
* Return true if this page is mapped into pagetables. Subtle: test pte.direct
--- 2.5.36/init/main.c~remove-page-virtual Thu Sep 19 20:36:27 2002
+++ 2.5.36-akpm/init/main.c Thu Sep 19 20:36:27 2002
@@ -436,6 +436,7 @@ asmlinkage void __init start_kernel(void
initrd_start = 0;
}
#endif
+ page_address_init();
mem_init();
kmem_cache_sizes_init();
pidhash_init();
--- 2.5.36/kernel/ksyms.c~remove-page-virtual Thu Sep 19 20:36:27 2002
+++ 2.5.36-akpm/kernel/ksyms.c Thu Sep 19 20:36:27 2002
@@ -133,6 +133,9 @@ EXPORT_SYMBOL(highmem_start_page);
EXPORT_SYMBOL(kmap_prot);
EXPORT_SYMBOL(kmap_pte);
#endif
+#ifdef HASHED_PAGE_VIRTUAL
+EXPORT_SYMBOL(page_address);
+#endif
EXPORT_SYMBOL(get_user_pages);

/* filesystem internal functions */
--- 2.5.36/mm/highmem.c~remove-page-virtual Thu Sep 19 20:36:27 2002
+++ 2.5.36-akpm/mm/highmem.c Thu Sep 19 20:46:18 2002
@@ -22,6 +22,7 @@
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
+#include <linux/hash.h>
#include <asm/pgalloc.h>

static mempool_t *page_pool, *isa_page_pool;
@@ -88,7 +89,7 @@ static void flush_all_zero_pkmaps(void)
page = pte_page(pkmap_page_table[i]);
pte_clear(&pkmap_page_table[i]);

- page->virtual = NULL;
+ set_page_address(page, NULL);
}
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
@@ -126,8 +127,8 @@ start:
spin_lock(&kmap_lock);

/* Somebody else might have mapped it while we slept */
- if (page->virtual)
- return (unsigned long) page->virtual;
+ if (page_address(page))
+ return (unsigned long)page_address(page);

/* Re-start */
goto start;
@@ -137,7 +138,7 @@ start:
set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

pkmap_count[last_pkmap_nr] = 1;
- page->virtual = (void *) vaddr;
+ set_page_address(page, (void *)vaddr);

return vaddr;
}
@@ -153,7 +154,7 @@ void *kmap_high(struct page *page)
* We cannot call this from interrupts, as it may block
*/
spin_lock(&kmap_lock);
- vaddr = (unsigned long) page->virtual;
+ vaddr = (unsigned long)page_address(page);
if (!vaddr)
vaddr = map_new_virtual(page);
pkmap_count[PKMAP_NR(vaddr)]++;
@@ -170,7 +171,7 @@ void kunmap_high(struct page *page)
int need_wakeup;

spin_lock(&kmap_lock);
- vaddr = (unsigned long) page->virtual;
+ vaddr = (unsigned long)page_address(page);
if (!vaddr)
BUG();
nr = PKMAP_NR(vaddr);
@@ -467,7 +468,7 @@ void blk_queue_bounce(request_queue_t *q
*bio_orig = bio;
}

-#if CONFIG_DEBUG_HIGHMEM
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_HIGHMEM)
void check_highmem_ptes(void)
{
int idx, type;
@@ -482,3 +483,129 @@ void check_highmem_ptes(void)
}
#endif

+#if defined(HASHED_PAGE_VIRTUAL)
+
+#define PA_HASH_ORDER 7
+
+/*
+ * Describes one page->virtual association
+ */
+static struct page_address_map {
+ struct page *page;
+ void *virtual;
+ struct list_head list;
+} page_address_maps[LAST_PKMAP];
+
+/*
+ * page_address_map freelist, allocated from page_address_maps.
+ */
+static struct list_head page_address_pool; /* freelist */
+static spinlock_t pool_lock; /* yech */
+
+/*
+ * Hash table bucket
+ */
+static struct page_address_slot {
+ struct list_head lh;
+ spinlock_t lock;
+} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
+
+static struct page_address_slot *page_slot(struct page *page)
+{
+ return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
+}
+
+void *page_address(struct page *page)
+{
+ unsigned long flags;
+ void *ret;
+ struct page_address_slot *pas;
+
+ if (!PageHighMem(page))
+ return lowmem_page_address(page);
+
+ pas = page_slot(page);
+ ret = NULL;
+ spin_lock_irqsave(&pas->lock, flags);
+ if (!list_empty(&pas->lh)) {
+ struct page_address_map *pam;
+
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ ret = pam->virtual;
+ goto done;
+ }
+ }
+ }
+done:
+ spin_unlock_irqrestore(&pas->lock, flags);
+ return ret;
+}
+
+void set_page_address(struct page *page, void *virtual)
+{
+ unsigned long flags;
+ struct page_address_slot *pas;
+ struct page_address_map *pam;
+
+ BUG_ON(!PageHighMem(page));
+
+ if (virtual) {
+ void *addr = page_address(page);
+ if (addr) {
+ printk("eek!\n");
+ if (addr != virtual)
+ printk("double eek!\n");
+ }
+ }
+
+ pas = page_slot(page);
+ if (virtual) {
+ BUG_ON(list_empty(&page_address_pool));
+
+ spin_lock_irqsave(&pool_lock, flags);
+ pam = list_entry(page_address_pool.next,
+ struct page_address_map, list);
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pool_lock, flags);
+
+ pam->page = page;
+ pam->virtual = virtual;
+
+ spin_lock_irqsave(&pas->lock, flags);
+ list_add_tail(&pam->list, &pas->lh);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ } else {
+ spin_lock_irqsave(&pas->lock, flags);
+ list_for_each_entry(pam, &pas->lh, list) {
+ if (pam->page == page) {
+ list_del(&pam->list);
+ spin_unlock_irqrestore(&pas->lock, flags);
+ spin_lock_irqsave(&pool_lock, flags);
+ list_add_tail(&pam->list, &page_address_pool);
+ spin_unlock_irqrestore(&pool_lock, flags);
+ goto done;
+ }
+ }
+ spin_unlock_irqrestore(&pas->lock, flags);
+ printk("aargh!\n");
+ }
+done:
+ return;
+}
+
+void __init page_address_init(void)
+{
+ int i;
+
+ INIT_LIST_HEAD(&page_address_pool);
+ for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
+ list_add(&page_address_maps[i].list, &page_address_pool);
+ for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
+ INIT_LIST_HEAD(&page_address_htable[i].lh);
+ spin_lock_init(&page_address_htable[i].lock);
+ }
+ spin_lock_init(&pool_lock);
+}
+
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
--- 2.5.36/mm/page_alloc.c~remove-page-virtual Thu Sep 19 20:36:27 2002
+++ 2.5.36-akpm/mm/page_alloc.c Thu Sep 19 20:36:27 2002
@@ -917,12 +917,15 @@ void __init free_area_init_core(pg_data_
set_page_count(page, 0);
SetPageReserved(page);
INIT_LIST_HEAD(&page->list);
+#ifdef WANT_PAGE_VIRTUAL
if (j != ZONE_HIGHMEM)
/*
* The shift left won't overflow because the
* ZONE_NORMAL is below 4G.
*/
- set_page_address(page, __va(zone_start_pfn << PAGE_SHIFT));
+ set_page_address(page,
+ __va(zone_start_pfn << PAGE_SHIFT));
+#endif
zone_start_pfn++;
}




2002-09-20 05:04:23

by William Lee Irwin III

Subject: Re: [patch] remove page->virtual

On Thu, Sep 19, 2002 at 09:55:52PM -0700, Andrew Morton wrote:
> set_page_address() and page_address() implementations consume 0.4% and
> 1.3% of CPU time respectively. I think that's OK. (Plus the tested code
> was doing an unneeded lookup in set_page_address(), for debug purposes)

Looks yummy. I'll take it for a spin tonight on my benchmark-o-matic.
Clearing some more air in ZONE_NORMAL is always welcome here.


On Thu, Sep 19, 2002 at 09:55:52PM -0700, Andrew Morton wrote:
> c01884f2 6914 10.5108 .text.lock.dir
> c01546b3 5847 8.88872 .text.lock.namei
> c01eb99e 3811 5.79355 .text.lock.dec_and_lock
> c01515dc 3775 5.73883 link_path_walk
> c015207c 3567 5.42262 path_lookup
> c015aba4 3194 4.85558 __d_lookup
> c01eb690 2814 4.2779 __generic_copy_to_user
> c01eb950 2562 3.8948 atomic_dec_and_lock
> c0187580 2473 3.7595 ext2_readdir
> c0155d3c 2172 3.30192 filldir64
> c0151114 1786 2.71511 path_release
> c0145b6a 1753 2.66494 .text.lock.open

What's going on here? fs stuff is really hurting. At any rate, the
overhead of the address calculation and hashtable lookup is microscopic
according to this, and I want space.


Cheers,
Bill

2002-09-20 05:36:47

by Andrew Morton

Subject: Re: [patch] remove page->virtual

William Lee Irwin III wrote:
>
> On Thu, Sep 19, 2002 at 09:55:52PM -0700, Andrew Morton wrote:
> > set_page_address() and page_address() implementations consume 0.4% and
> > 1.3% of CPU time respectively. I think that's OK. (Plus the tested code
> > was doing an unneeded lookup in set_page_address(), for debug purposes)
>
> Looks yummy. I'll take it for a spin tonight on my benchmark-o-matic.
> Clearing some more air in ZONE_NORMAL is always welcome here.

Ta.

> On Thu, Sep 19, 2002 at 09:55:52PM -0700, Andrew Morton wrote:
> > c01884f2 6914 10.5108 .text.lock.dir
> > c01546b3 5847 8.88872 .text.lock.namei
> > c01eb99e 3811 5.79355 .text.lock.dec_and_lock
> > c01515dc 3775 5.73883 link_path_walk
> > c015207c 3567 5.42262 path_lookup
> > c015aba4 3194 4.85558 __d_lookup
> > c01eb690 2814 4.2779 __generic_copy_to_user
> > c01eb950 2562 3.8948 atomic_dec_and_lock
> > c0187580 2473 3.7595 ext2_readdir
> > c0155d3c 2172 3.30192 filldir64
> > c0151114 1786 2.71511 path_release
> > c0145b6a 1753 2.66494 .text.lock.open
>
> What's going on here? fs stuff is really hurting. At any rate, the
> overhead of the address calculation and hashtable lookup is microscopic
> according to this, and I want space.


c0182f90 5949 14.5552 ext2_readdir

lock_kernel()

c014f65c 5790 14.1662 path_lookup

Confused. oprofile claims read_lock(&current->fs->lock) but
it's surely dcache_lock.

c01e6e80 4275 10.4595 atomic_dec_and_lock

dput/iput

c014eba4 2264 5.53924 link_path_walk
c0158050 1988 4.86397 __d_lookup
c01426e8 1903 4.656 sys_chdir
c01e6bc0 1735 4.24496 __generic_copy_to_user
c015319c 1344 3.28831 filldir64
c014e6b4 1205 2.94823 path_release
c0109070 1065 2.6057 system_call
c014b934 929 2.27295 cp_new_stat64
c014b380 822 2.01116 generic_fillattr
c0157250 712 1.74202 dput
c014b3fc 622 1.52182 vfs_getattr
c0157468 556 1.36034 dget_locked