2001-11-26 12:26:05

by Momchil Velikov

Subject: [PATCH] Scalable page cache

Hi,

This patch:

- replaces the global page cache hash table with a per-mapping
splay tree;

- eliminates the ``pagecache_lock''; instead, ``i_shared_lock''
is used to serialize access during insertion into and deletion
from the tree;

The goals of the patch are to:

- improve scalability (via the elimination of the global
lock);

- reduce the memory/cache footprint (via the elimination of
``page_hash_table'');

The patch is against 2.4.16-pre1. Comments are welcome.
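
For reference, the core of the change to the lookup path, as a minimal
sketch mirroring __find_get_page() in the diff below (illustration only,
not additional code in the patch):

/*
 * Lookup after the patch: the global pagecache_lock and hash walk are
 * replaced by the per-mapping i_shared_lock and a splay-tree lookup
 * rooted at mapping->root.
 */
static struct page *lookup_sketch(struct address_space *mapping,
				  unsigned long index)
{
	struct page *page;

	spin_lock(&mapping->i_shared_lock);
	page = page_splay_tree_find(&mapping->root, index); /* splays the tree */
	if (page)
		page_cache_get(page);	/* pin the page before dropping the lock */
	spin_unlock(&mapping->i_shared_lock);
	return page;
}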

Regards,
-velco

diff -Nru a/arch/arm/mm/init.c b/arch/arm/mm/init.c
--- a/arch/arm/mm/init.c Sun Nov 25 16:39:44 2001
+++ b/arch/arm/mm/init.c Sun Nov 25 16:39:44 2001
@@ -103,7 +103,7 @@
* #define PG_skip 10
* #define PageSkip(page) (machine_is_riscpc() && test_bit(PG_skip, &(page)->flags))
* if (PageSkip(page)) {
- * page = page->next_hash;
+ * page = page->left;
* if (page == NULL)
* break;
* }
diff -Nru a/arch/arm/mm/small_page.c b/arch/arm/mm/small_page.c
--- a/arch/arm/mm/small_page.c Sun Nov 25 16:39:44 2001
+++ b/arch/arm/mm/small_page.c Sun Nov 25 16:39:44 2001
@@ -38,15 +38,15 @@
* Theory:
* We "misuse" the Linux memory management system. We use alloc_page
* to allocate a page and then mark it as reserved. The Linux memory
- * management system will then ignore the "offset", "next_hash" and
- * "pprev_hash" entries in the mem_map for this page.
+ * management system will then ignore the "offset", "left" and
+ * "right" entries in the mem_map for this page.
*
* We then use a bitstring in the "offset" field to mark which segments
* of the page are in use, and manipulate this as required during the
* allocation and freeing of these small pages.
*
* We also maintain a queue of pages being used for this purpose using
- * the "next_hash" and "pprev_hash" entries of mem_map;
+ * the "left" and "right" entries of mem_map;
*/

struct order {
@@ -81,20 +81,20 @@
if (page->pprev_hash)
PAGE_BUG(page);
#endif
- page->next_hash = *p;
+ page->left = *p;
if (*p)
- (*p)->pprev_hash = &page->next_hash;
+ (struct page **)(*p)->right = &page->left;
*p = page;
- page->pprev_hash = p;
+ (struct page **)page->right = p;
}

static void remove_page_from_queue(struct page *page)
{
- if (page->pprev_hash) {
- if (page->next_hash)
- page->next_hash->pprev_hash = page->pprev_hash;
- *page->pprev_hash = page->next_hash;
- page->pprev_hash = NULL;
+ if (page->right) {
+ if (page->left)
+ page->left->right = page->right;
+ *(struct page **)page->right = page->left;
+ page->right = NULL;
}
}

diff -Nru a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
--- a/arch/sparc64/mm/init.c Sun Nov 25 16:39:44 2001
+++ b/arch/sparc64/mm/init.c Sun Nov 25 16:39:44 2001
@@ -83,18 +83,18 @@
if (pgd_cache_size > high / 4) {
struct page *page, *page2;
for (page2 = NULL, page = (struct page *)pgd_quicklist; page;) {
- if ((unsigned long)page->pprev_hash == 3) {
+ if ((unsigned long)page->right == 3) {
if (page2)
- page2->next_hash = page->next_hash;
+ page2->left = page->left;
else
- (struct page *)pgd_quicklist = page->next_hash;
- page->next_hash = NULL;
- page->pprev_hash = NULL;
+ (struct page *)pgd_quicklist = page->left;
+ page->left = NULL;
+ page->right = NULL;
pgd_cache_size -= 2;
__free_page(page);
freed++;
if (page2)
- page = page2->next_hash;
+ page = page2->left;
else
page = (struct page *)pgd_quicklist;
if (pgd_cache_size <= low / 4)
@@ -102,7 +102,7 @@
continue;
}
page2 = page;
- page = page->next_hash;
+ page = page->left;
}
}
#endif
diff -Nru a/drivers/block/rd.c b/drivers/block/rd.c
--- a/drivers/block/rd.c Sun Nov 25 16:39:44 2001
+++ b/drivers/block/rd.c Sun Nov 25 16:39:44 2001
@@ -243,7 +243,6 @@

do {
int count;
- struct page ** hash;
struct page * page;
char * src, * dst;
int unlock = 0;
@@ -253,8 +252,7 @@
count = size;
size -= count;

- hash = page_hash(mapping, index);
- page = __find_get_page(mapping, index, hash);
+ page = __find_get_page(mapping, index);
if (!page) {
page = grab_cache_page(mapping, index);
err = -ENOMEM;
diff -Nru a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c Sun Nov 25 16:39:44 2001
+++ b/fs/inode.c Sun Nov 25 16:39:44 2001
@@ -110,6 +110,7 @@
sema_init(&inode->i_sem, 1);
sema_init(&inode->i_zombie, 1);
spin_lock_init(&inode->i_data.i_shared_lock);
+ inode->i_data.root = 0;
}
}

diff -Nru a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h Sun Nov 25 16:39:44 2001
+++ b/include/linux/fs.h Sun Nov 25 16:39:44 2001
@@ -403,6 +403,7 @@
struct vm_area_struct *i_mmap; /* list of private mappings */
struct vm_area_struct *i_mmap_shared; /* list of shared mappings */
spinlock_t i_shared_lock; /* and spinlock protecting it */
+ struct page *root;
int gfp_mask; /* how to allocate the pages */
};

diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Sun Nov 25 16:39:44 2001
+++ b/include/linux/mm.h Sun Nov 25 16:39:44 2001
@@ -152,15 +152,14 @@
struct list_head list; /* ->mapping has some page lists. */
struct address_space *mapping; /* The inode (or ...) we belong to. */
unsigned long index; /* Our offset within mapping. */
- struct page *next_hash; /* Next page sharing our hash bucket in
- the pagecache hash table. */
+ struct page *left; /* Left child in the mapping tree */
+ struct page *right; /* Right child in the mapping tree */
atomic_t count; /* Usage count, see below. */
unsigned long flags; /* atomic flags, some possibly
updated asynchronously */
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
wait_queue_head_t wait; /* Page locked? Stand in line... */
- struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
@@ -228,9 +227,9 @@
* using the page->list list_head. These fields are also used for
* freelist managemet (when page->count==0).
*
- * There is also a hash table mapping (mapping,index) to the page
- * in memory if present. The lists for this hash table use the fields
- * page->next_hash and page->pprev_hash.
+ * There is also a per-mapping splay tree mapping (index) to the page
+ * in memory if present. The tree uses the fields page->left and
+ * page->right.
*
* All process pages can do I/O:
* - inode pages may need to be read from disk,
diff -Nru a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h Sun Nov 25 16:39:44 2001
+++ b/include/linux/pagemap.h Sun Nov 25 16:39:44 2001
@@ -41,53 +41,26 @@
*/
#define page_cache_entry(x) virt_to_page(x)

-extern unsigned int page_hash_bits;
-#define PAGE_HASH_BITS (page_hash_bits)
-#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)
-
-extern atomic_t page_cache_size; /* # of pages currently in the hash table */
-extern struct page **page_hash_table;
-
-extern void page_cache_init(unsigned long);
-
-/*
- * We use a power-of-two hash table to avoid a modulus,
- * and get a reasonable hash by knowing roughly how the
- * inode pointer and indexes are distributed (ie, we
- * roughly know which bits are "significant")
- *
- * For the time being it will work for struct address_space too (most of
- * them sitting inside the inodes). We might want to change it later.
- */
-static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index)
-{
-#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1)))
-#define s(x) ((x)+((x)>>PAGE_HASH_BITS))
- return s(i+index) & (PAGE_HASH_SIZE-1);
-#undef i
-#undef s
-}
-
-#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
+extern atomic_t page_cache_size; /* # of pages currently in the page cache */

extern struct page * __find_get_page(struct address_space *mapping,
- unsigned long index, struct page **hash);
+ unsigned long index);
#define find_get_page(mapping, index) \
- __find_get_page(mapping, index, page_hash(mapping, index))
+ __find_get_page(mapping, index)
extern struct page * __find_lock_page (struct address_space * mapping,
- unsigned long index, struct page **hash);
+ unsigned long index);
extern struct page * find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask);

extern void FASTCALL(lock_page(struct page *page));
extern void FASTCALL(unlock_page(struct page *page));
#define find_lock_page(mapping, index) \
- __find_lock_page(mapping, index, page_hash(mapping, index))
+ __find_lock_page(mapping, index)
extern struct page *find_trylock_page(struct address_space *, unsigned long);

extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index);
extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index);
-extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash);
+extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index);

extern void ___wait_on_page(struct page *);

diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c Sun Nov 25 16:39:44 2001
+++ b/init/main.c Sun Nov 25 16:39:44 2001
@@ -597,7 +597,6 @@
proc_caches_init();
vfs_caches_init(mempages);
buffer_init(mempages);
- page_cache_init(mempages);
#if defined(CONFIG_ARCH_S390)
ccwcache_init();
#endif
diff -Nru a/kernel/ksyms.c b/kernel/ksyms.c
--- a/kernel/ksyms.c Sun Nov 25 16:39:44 2001
+++ b/kernel/ksyms.c Sun Nov 25 16:39:44 2001
@@ -215,8 +215,6 @@
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_ro_fops);
EXPORT_SYMBOL(generic_buffer_fdatasync);
-EXPORT_SYMBOL(page_hash_bits);
-EXPORT_SYMBOL(page_hash_table);
EXPORT_SYMBOL(file_lock_list);
EXPORT_SYMBOL(locks_init_lock);
EXPORT_SYMBOL(locks_copy_lock);
diff -Nru a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile Sun Nov 25 16:39:44 2001
+++ b/mm/Makefile Sun Nov 25 16:39:44 2001
@@ -14,7 +14,7 @@
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
- shmem.o
+ shmem.o page_splay.o

obj-$(CONFIG_HIGHMEM) += highmem.o

diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c Sun Nov 25 16:39:44 2001
+++ b/mm/filemap.c Sun Nov 25 16:39:44 2001
@@ -31,6 +31,8 @@

#include <linux/highmem.h>

+#include "page_splay.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -44,8 +46,6 @@
*/

atomic_t page_cache_size = ATOMIC_INIT(0);
-unsigned int page_hash_bits;
-struct page **page_hash_table;

int vm_max_readahead = 31;
int vm_min_readahead = 3;
@@ -53,35 +53,20 @@
EXPORT_SYMBOL(vm_min_readahead);


-spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/*
* NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
- * with the pagecache_lock held.
+ * with the mapping lock held.
*
* Ordering:
* swap_lock ->
* pagemap_lru_lock ->
- * pagecache_lock
+ * mapping lock
*/
spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

#define CLUSTER_PAGES (1 << page_cluster)
#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)

-static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
-static void add_page_to_hash_queue(struct page * page, struct page **p)
-{
- struct page *next = *p;
-
- *p = page;
- page->next_hash = next;
- page->pprev_hash = p;
- if (next)
- next->pprev_hash = &page->next_hash;
- if (page->buffers)
- PAGE_BUG(page);
- atomic_inc(&page_cache_size);
-}

static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
@@ -101,18 +86,6 @@
page->mapping = NULL;
}

-static inline void remove_page_from_hash_queue(struct page * page)
-{
- struct page *next = page->next_hash;
- struct page **pprev = page->pprev_hash;
-
- if (next)
- next->pprev_hash = pprev;
- *pprev = next;
- page->pprev_hash = NULL;
- atomic_dec(&page_cache_size);
-}
-
/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
@@ -121,18 +94,20 @@
void __remove_inode_page(struct page *page)
{
if (PageDirty(page)) BUG();
+ page_splay_tree_delete (&page->mapping->root, page->index);
remove_page_from_inode_queue(page);
- remove_page_from_hash_queue(page);
+ atomic_dec(&page_cache_size);
}

void remove_inode_page(struct page *page)
{
+ struct address_space *mapping = page->mapping;
if (!PageLocked(page))
PAGE_BUG(page);

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
__remove_inode_page(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
}

static inline int sync_page(struct page *page)
@@ -153,10 +128,10 @@
struct address_space *mapping = page->mapping;

if (mapping) {
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);

if (mapping->host)
mark_inode_dirty_pages(mapping->host);
@@ -176,11 +151,12 @@
{
struct list_head *head, *curr;
struct page * page;
+ struct address_space *mapping = inode->i_mapping;

- head = &inode->i_mapping->clean_pages;
+ head = &mapping->clean_pages;

spin_lock(&pagemap_lru_lock);
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
curr = head->next;

while (curr != head) {
@@ -211,7 +187,7 @@
continue;
}

- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
spin_unlock(&pagemap_lru_lock);
}

@@ -250,8 +226,9 @@
page_cache_release(page);
}

-static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
-static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
+static int FASTCALL(truncate_list_pages(struct address_space *, struct list_head *, unsigned long, unsigned *));
+static int truncate_list_pages(struct address_space *mapping,
+ struct list_head *head, unsigned long start, unsigned *partial)
{
struct list_head *curr;
struct page * page;
@@ -280,7 +257,7 @@
/* Restart on this page */
list_add(head, curr);

- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
unlocked = 1;

if (!failed) {
@@ -301,7 +278,7 @@
schedule();
}

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
goto restart;
}
curr = curr->prev;
@@ -325,24 +302,25 @@
unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
int unlocked;

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
do {
- unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
- unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
- unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
+ unlocked = truncate_list_pages(mapping, &mapping->clean_pages, start, &partial);
+ unlocked |= truncate_list_pages(mapping, &mapping->dirty_pages, start, &partial);
+ unlocked |= truncate_list_pages(mapping, &mapping->locked_pages, start, &partial);
} while (unlocked);
/* Traversed all three lists without dropping the lock */
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
}

-static inline int invalidate_this_page2(struct page * page,
+static inline int invalidate_this_page2(struct address_space *mapping,
+ struct page * page,
struct list_head * curr,
struct list_head * head)
{
int unlocked = 1;

/*
- * The page is locked and we hold the pagecache_lock as well
+ * The page is locked and we hold the mapping lock as well
* so both page_count(page) and page->buffers stays constant here.
*/
if (page_count(page) == 1 + !!page->buffers) {
@@ -351,7 +329,7 @@
list_add_tail(head, curr);

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
truncate_complete_page(page);
} else {
if (page->buffers) {
@@ -360,7 +338,7 @@
list_add_tail(head, curr);

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
block_invalidate_page(page);
} else
unlocked = 0;
@@ -372,8 +350,8 @@
return unlocked;
}

-static int FASTCALL(invalidate_list_pages2(struct list_head *));
-static int invalidate_list_pages2(struct list_head *head)
+static int FASTCALL(invalidate_list_pages2(struct address_space *mapping, struct list_head *));
+static int invalidate_list_pages2(struct address_space *mapping, struct list_head *head)
{
struct list_head *curr;
struct page * page;
@@ -387,7 +365,7 @@
if (!TryLockPage(page)) {
int __unlocked;

- __unlocked = invalidate_this_page2(page, curr, head);
+ __unlocked = invalidate_this_page2(mapping, page, curr, head);
UnlockPage(page);
unlocked |= __unlocked;
if (!__unlocked) {
@@ -400,7 +378,7 @@
list_add(head, curr);

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
unlocked = 1;
wait_on_page(page);
}
@@ -411,7 +389,7 @@
schedule();
}

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
goto restart;
}
return unlocked;
@@ -426,32 +404,13 @@
{
int unlocked;

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
do {
- unlocked = invalidate_list_pages2(&mapping->clean_pages);
- unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
- unlocked |= invalidate_list_pages2(&mapping->locked_pages);
+ unlocked = invalidate_list_pages2(mapping, &mapping->clean_pages);
+ unlocked |= invalidate_list_pages2(mapping, &mapping->dirty_pages);
+ unlocked |= invalidate_list_pages2(mapping, &mapping->locked_pages);
} while (unlocked);
- spin_unlock(&pagecache_lock);
-}
-
-static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
-{
- goto inside;
-
- for (;;) {
- page = page->next_hash;
-inside:
- if (!page)
- goto not_found;
- if (page->mapping != mapping)
- continue;
- if (page->index == offset)
- break;
- }
-
-not_found:
- return page;
+ spin_unlock(&mapping->i_shared_lock);
}

/*
@@ -489,13 +448,14 @@
return error;
}

-static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
+static int do_buffer_fdatasync(struct address_space *mapping,
+ struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
struct list_head *curr;
struct page *page;
int retval = 0;

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
curr = head->next;
while (curr != head) {
page = list_entry(curr, struct page, list);
@@ -508,7 +468,7 @@
continue;

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
lock_page(page);

/* The buffers could have been free'd while we waited for the page lock */
@@ -516,11 +476,11 @@
retval |= fn(page);

UnlockPage(page);
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
curr = page->list.next;
page_cache_release(page);
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);

return retval;
}
@@ -531,17 +491,18 @@
*/
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
+ struct address_space *mapping = inode->i_mapping;
int retval;

/* writeout dirty buffers on pages from both clean and dirty lists */
- retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
- retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
- retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
+ retval = do_buffer_fdatasync(mapping, &mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, start_idx, end_idx, writeout_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, start_idx, end_idx, writeout_one_page);

/* now wait for locked buffers on pages from both clean and dirty lists */
- retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
- retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
- retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
+ retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, start_idx, end_idx, waitfor_one_page);

return retval;
}
@@ -586,7 +547,7 @@
{
int (*writepage)(struct page *) = mapping->a_ops->writepage;

- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);

while (!list_empty(&mapping->dirty_pages)) {
struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);
@@ -598,7 +559,7 @@
continue;

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);

lock_page(page);

@@ -609,9 +570,9 @@
UnlockPage(page);

page_cache_release(page);
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
}

/**
@@ -623,7 +584,7 @@
*/
void filemap_fdatawait(struct address_space * mapping)
{
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);

while (!list_empty(&mapping->locked_pages)) {
struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
@@ -635,14 +596,14 @@
continue;

page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);

___wait_on_page(page);

page_cache_release(page);
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
}

/*
@@ -658,10 +619,11 @@

page->index = index;
page_cache_get(page);
- spin_lock(&pagecache_lock);
+ spin_lock (&mapping->i_shared_lock);
add_page_to_inode_queue(mapping, page);
- add_page_to_hash_queue(page, page_hash(mapping, index));
- spin_unlock(&pagecache_lock);
+ page_splay_tree_insert (&mapping->root, page);
+ atomic_inc(&page_cache_size);
+ spin_unlock(&mapping->i_shared_lock);

lru_cache_add(page);
}
@@ -671,8 +633,7 @@
* owned by us, but unreferenced, not uptodate and with no errors.
*/
static inline void __add_to_page_cache(struct page * page,
- struct address_space *mapping, unsigned long offset,
- struct page **hash)
+ struct address_space *mapping, unsigned long offset)
{
unsigned long flags;

@@ -681,34 +642,34 @@
page_cache_get(page);
page->index = offset;
add_page_to_inode_queue(mapping, page);
- add_page_to_hash_queue(page, hash);
+ page_splay_tree_insert (&mapping->root, page);
+ atomic_inc(&page_cache_size);
}

void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
- spin_lock(&pagecache_lock);
- __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
- spin_unlock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
+ __add_to_page_cache(page, mapping, offset);
+ spin_unlock(&mapping->i_shared_lock);
lru_cache_add(page);
}

int add_to_page_cache_unique(struct page * page,
- struct address_space *mapping, unsigned long offset,
- struct page **hash)
+ struct address_space *mapping, unsigned long offset)
{
int err;
struct page *alias;

- spin_lock(&pagecache_lock);
- alias = __find_page_nolock(mapping, offset, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ alias = page_splay_tree_find (&mapping->root, offset);

err = 1;
if (!alias) {
- __add_to_page_cache(page,mapping,offset,hash);
+ __add_to_page_cache(page,mapping,offset);
err = 0;
}

- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
if (!err)
lru_cache_add(page);
return err;
@@ -722,12 +683,11 @@
static int page_cache_read(struct file * file, unsigned long offset)
{
struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
- struct page **hash = page_hash(mapping, offset);
struct page *page;

- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, offset, *hash);
- spin_unlock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
+ page = page_splay_tree_find (&mapping->root, offset);
+ spin_unlock(&mapping->i_shared_lock);
if (page)
return 0;

@@ -735,7 +695,7 @@
if (!page)
return -ENOMEM;

- if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
+ if (!add_to_page_cache_unique(page, mapping, offset)) {
int error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
return error;
@@ -844,7 +804,7 @@
* hashed page atomically.
*/
struct page * __find_get_page(struct address_space *mapping,
- unsigned long offset, struct page **hash)
+ unsigned long offset)
{
struct page *page;

@@ -852,11 +812,11 @@
* We scan the hash list read-only. Addition to and removal from
* the hash-list needs a held write-lock.
*/
- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, offset, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ page = page_splay_tree_find (&mapping->root, offset);
if (page)
page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
return page;
}

@@ -866,15 +826,14 @@
struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
{
struct page *page;
- struct page **hash = page_hash(mapping, offset);

- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, offset, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ page = page_splay_tree_find (&mapping->root, offset);
if (page) {
if (TryLockPage(page))
page = NULL;
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
return page;
}

@@ -883,9 +842,9 @@
* will return with it held (but it may be dropped
* during blocking operations..
*/
-static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
+static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long));
static struct page * __find_lock_page_helper(struct address_space *mapping,
- unsigned long offset, struct page *hash)
+ unsigned long offset)
{
struct page *page;

@@ -894,13 +853,13 @@
* the hash-list needs a held write-lock.
*/
repeat:
- page = __find_page_nolock(mapping, offset, hash);
+ page = page_splay_tree_find (&mapping->root, offset);
if (page) {
page_cache_get(page);
if (TryLockPage(page)) {
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
lock_page(page);
- spin_lock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);

/* Has the page been re-allocated while we slept? */
if (page->mapping != mapping || page->index != offset) {
@@ -918,13 +877,13 @@
* it's still valid once we own it.
*/
struct page * __find_lock_page (struct address_space *mapping,
- unsigned long offset, struct page **hash)
+ unsigned long offset)
{
struct page *page;

- spin_lock(&pagecache_lock);
- page = __find_lock_page_helper(mapping, offset, *hash);
- spin_unlock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
+ page = __find_lock_page_helper(mapping, offset);
+ spin_unlock(&mapping->i_shared_lock);
return page;
}

@@ -934,23 +893,22 @@
struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
{
struct page *page;
- struct page **hash = page_hash(mapping, index);

- spin_lock(&pagecache_lock);
- page = __find_lock_page_helper(mapping, index, *hash);
- spin_unlock(&pagecache_lock);
+ spin_lock(&mapping->i_shared_lock);
+ page = __find_lock_page_helper(mapping, index);
+ spin_unlock(&mapping->i_shared_lock);
if (!page) {
struct page *newpage = alloc_page(gfp_mask);
page = ERR_PTR(-ENOMEM);
if (newpage) {
- spin_lock(&pagecache_lock);
- page = __find_lock_page_helper(mapping, index, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ page = __find_lock_page_helper(mapping, index);
if (likely(!page)) {
page = newpage;
- __add_to_page_cache(page, mapping, index, hash);
+ __add_to_page_cache(page, mapping, index);
newpage = NULL;
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
if (newpage == NULL)
lru_cache_add(page);
else
@@ -977,10 +935,9 @@
*/
struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
- struct page *page, **hash;
+ struct page *page;

- hash = page_hash(mapping, index);
- page = __find_get_page(mapping, index, hash);
+ page = __find_get_page(mapping, index);

if ( page ) {
if ( !TryLockPage(page) ) {
@@ -1005,7 +962,7 @@
if ( unlikely(!page) )
return NULL; /* Failed to allocate a page */

- if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
+ if ( unlikely(add_to_page_cache_unique(page, mapping, index)) ) {
/* Someone else grabbed the page already. */
page_cache_release(page);
return NULL;
@@ -1330,7 +1287,7 @@
}

for (;;) {
- struct page *page, **hash;
+ struct page *page;
unsigned long end_index, nr, ret;

end_index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -1349,15 +1306,14 @@
/*
* Try to find the data in the page cache..
*/
- hash = page_hash(mapping, index);

- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, index, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ page = page_splay_tree_find (&mapping->root, index);
if (!page)
goto no_cached_page;
found_page:
page_cache_get(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);

if (!Page_Uptodate(page))
goto page_not_up_to_date;
@@ -1451,7 +1407,7 @@
* We get here with the page cache lock held.
*/
if (!cached_page) {
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
cached_page = page_cache_alloc(mapping);
if (!cached_page) {
desc->error = -ENOMEM;
@@ -1462,8 +1418,8 @@
* Somebody may have added the page while we
* dropped the page cache lock. Check for that.
*/
- spin_lock(&pagecache_lock);
- page = __find_page_nolock(mapping, index, *hash);
+ spin_lock(&mapping->i_shared_lock);
+ page = page_splay_tree_find (&mapping->root, index);
if (page)
goto found_page;
}
@@ -1472,8 +1428,8 @@
* Ok, add the new page to the hash-queues...
*/
page = cached_page;
- __add_to_page_cache(page, mapping, index, hash);
- spin_unlock(&pagecache_lock);
+ __add_to_page_cache(page, mapping, index);
+ spin_unlock(&mapping->i_shared_lock);
lru_cache_add(page);
cached_page = NULL;

@@ -1873,7 +1829,7 @@
struct file *file = area->vm_file;
struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
struct inode *inode = mapping->host;
- struct page *page, **hash;
+ struct page *page;
unsigned long size, pgoff, endoff;

pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
@@ -1895,9 +1851,8 @@
/*
* Do we have something in the page cache already?
*/
- hash = page_hash(mapping, pgoff);
retry_find:
- page = __find_get_page(mapping, pgoff, hash);
+ page = __find_get_page(mapping, pgoff);
if (!page)
goto no_cached_page;

@@ -2582,13 +2537,13 @@
{
unsigned char present = 0;
struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
- struct page * page, ** hash = page_hash(as, pgoff);
+ struct page * page;

- spin_lock(&pagecache_lock);
- page = __find_page_nolock(as, pgoff, *hash);
+ spin_lock(&as->i_shared_lock);
+ page = page_splay_tree_find (&as->root, pgoff);
if ((page) && (Page_Uptodate(page)))
present = 1;
- spin_unlock(&pagecache_lock);
+ spin_unlock(&as->i_shared_lock);

return present;
}
@@ -2731,11 +2686,10 @@
int (*filler)(void *,struct page*),
void *data)
{
- struct page **hash = page_hash(mapping, index);
struct page *page, *cached_page = NULL;
int err;
repeat:
- page = __find_get_page(mapping, index, hash);
+ page = __find_get_page(mapping, index);
if (!page) {
if (!cached_page) {
cached_page = page_cache_alloc(mapping);
@@ -2743,7 +2697,7 @@
return ERR_PTR(-ENOMEM);
}
page = cached_page;
- if (add_to_page_cache_unique(page, mapping, index, hash))
+ if (add_to_page_cache_unique(page, mapping, index))
goto repeat;
cached_page = NULL;
err = filler(data, page);
@@ -2799,9 +2753,9 @@
static inline struct page * __grab_cache_page(struct address_space *mapping,
unsigned long index, struct page **cached_page)
{
- struct page *page, **hash = page_hash(mapping, index);
+ struct page *page;
repeat:
- page = __find_lock_page(mapping, index, hash);
+ page = __find_lock_page(mapping, index);
if (!page) {
if (!*cached_page) {
*cached_page = page_cache_alloc(mapping);
@@ -2809,7 +2763,7 @@
return NULL;
}
page = *cached_page;
- if (add_to_page_cache_unique(page, mapping, index, hash))
+ if (add_to_page_cache_unique(page, mapping, index))
goto repeat;
*cached_page = NULL;
}
@@ -3072,29 +3026,3 @@
goto out_status;
}

-void __init page_cache_init(unsigned long mempages)
-{
- unsigned long htable_size, order;
-
- htable_size = mempages;
- htable_size *= sizeof(struct page *);
- for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
- ;
-
- do {
- unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
-
- page_hash_bits = 0;
- while((tmp >>= 1UL) != 0UL)
- page_hash_bits++;
-
- page_hash_table = (struct page **)
- __get_free_pages(GFP_ATOMIC, order);
- } while(page_hash_table == NULL && --order > 0);
-
- printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
- (1 << page_hash_bits), order, (PAGE_SIZE << order));
- if (!page_hash_table)
- panic("Failed to allocate page hash table\n");
- memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
-}
diff -Nru a/mm/page_splay.c b/mm/page_splay.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/mm/page_splay.c Sun Nov 25 16:39:44 2001
@@ -0,0 +1,136 @@
+/* Self-adjusting binary search tree (splay tree).
+ *
+ * Copyright (C) 1999, 2000, 2001 Momchil Velikov
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/mm.h>
+
+struct page *
+page_splay_tree_splay (struct page *t, unsigned long k)
+{
+ struct page N, *l, *r, *x;
+
+ if (t == 0)
+ return 0;
+
+ N.left = N.right = 0;
+ l = r = &N;
+
+ for (;;) {
+ if (k < t->index) {
+ x = t->left;
+ if (x == 0)
+ break;
+ if (k < x->index) {
+ t->left = x->right;
+ x->right = t;
+ t = x;
+ if (t->left == 0)
+ break;
+ }
+ r->left = t;
+ r = t;
+ t = t->left;
+ } else if (k > t->index) {
+ x = t->right;
+ if (x == 0)
+ break;
+ if (k > x->index) {
+ t->right = x->left;
+ x->left = t;
+ t = x;
+ if (t->right == 0)
+ break;
+ }
+ l->right = t;
+ l = t;
+ t = t->right;
+ }
+ else
+ break;
+ }
+
+ l->right = r->left = 0;
+
+ l->right = t->left;
+ r->left = t->right;
+ t->left = N.right;
+ t->right = N.left;
+
+ return t;
+}
+
+int
+page_splay_tree_insert (struct page **pt, struct page *k)
+{
+ struct page *t;
+
+ t = *pt;
+ if (t == 0) {
+ *pt = k;
+ return 0;
+ }
+
+ t = page_splay_tree_splay (t, k->index);
+
+ if (k->index < t->index) {
+ k->left = t->left;
+ k->right = t;
+ t->left = 0;
+
+ *pt = k;
+ return 0;
+ }
+
+ if (k->index > t->index) {
+ k->right = t->right;
+ k->left = t;
+ t->right = 0;
+
+ *pt = k;
+ return 0;
+ }
+
+ return -1;
+}
+
+struct page *
+page_splay_tree_delete (struct page **r, unsigned long k)
+{
+ struct page *t, *x;
+
+ if ((t = *r) == 0)
+ return 0;
+
+ t = page_splay_tree_splay (t, k);
+
+ if (t->index == k) {
+ if (t->left == 0)
+ x = t->right;
+ else {
+ x = page_splay_tree_splay (t->left, k);
+ x->right = t->right;
+ }
+
+ *r = x;
+
+ t->left = t->right = 0;
+ return t;
+ } else {
+ *r = t;
+ return 0;
+ }
+}
diff -Nru a/mm/page_splay.h b/mm/page_splay.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/mm/page_splay.h Sun Nov 25 16:39:44 2001
@@ -0,0 +1,36 @@
+/* Self-adjusting binary search tree (splay tree).
+ *
+ * Copyright (C) 1999, 2000, 2001 Momchil Velikov
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _PAGE_SPLAY_H
+#define _PAGE_SPLAY_H
+
+struct page;
+extern struct page * page_splay_tree_splay (struct page *t, unsigned long k);
+extern int page_splay_tree_insert (struct page **pt, struct page *k);
+extern struct page *page_splay_tree_delete (struct page **r, unsigned long k);
+
+static inline struct page *
+page_splay_tree_find (struct page **r, unsigned long k) {
+ *r = page_splay_tree_splay (*r, k);
+
+ if (*r && (*r)->index == k)
+ return *r;
+ else
+ return 0;
+}
+#endif /* _PAGE_SPLAY_H */
diff -Nru a/mm/swap_state.c b/mm/swap_state.c
--- a/mm/swap_state.c Sun Nov 25 16:39:44 2001
+++ b/mm/swap_state.c Sun Nov 25 16:39:44 2001
@@ -75,8 +75,7 @@
INC_CACHE_INFO(noent_race);
return -ENOENT;
}
- if (add_to_page_cache_unique(page, &swapper_space, entry.val,
- page_hash(&swapper_space, entry.val)) != 0) {
+ if (add_to_page_cache_unique(page, &swapper_space, entry.val) != 0) {
swap_free(entry);
INC_CACHE_INFO(exist_race);
return -EEXIST;
@@ -121,9 +120,9 @@

entry.val = page->index;

- spin_lock(&pagecache_lock);
+ spin_lock(&swapper_space.i_shared_lock);
__delete_from_swap_cache(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&swapper_space.i_shared_lock);

swap_free(entry);
page_cache_release(page);
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c Sun Nov 25 16:39:44 2001
+++ b/mm/swapfile.c Sun Nov 25 16:39:44 2001
@@ -239,10 +239,10 @@
/* Is the only swap cache user the cache itself? */
if (p->swap_map[SWP_OFFSET(entry)] == 1) {
/* Recheck the page count with the pagecache lock held.. */
- spin_lock(&pagecache_lock);
+ spin_lock(&swapper_space.i_shared_lock);
if (page_count(page) - !!page->buffers == 2)
retval = 1;
- spin_unlock(&pagecache_lock);
+ spin_unlock(&swapper_space.i_shared_lock);
}
swap_info_put(p);
}
@@ -307,13 +307,13 @@
retval = 0;
if (p->swap_map[SWP_OFFSET(entry)] == 1) {
/* Recheck the page count with the pagecache lock held.. */
- spin_lock(&pagecache_lock);
+ spin_lock(&swapper_space.i_shared_lock);
if (page_count(page) - !!page->buffers == 2) {
__delete_from_swap_cache(page);
SetPageDirty(page);
retval = 1;
}
- spin_unlock(&pagecache_lock);
+ spin_unlock(&swapper_space.i_shared_lock);
}
swap_info_put(p);

diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c Sun Nov 25 16:39:44 2001
+++ b/mm/vmscan.c Sun Nov 25 16:39:44 2001
@@ -337,6 +337,7 @@
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
{
struct list_head * entry;
+ struct address_space *mapping;
int max_scan = nr_inactive_pages / priority;
int max_mapped = nr_pages << (9 - priority);

@@ -391,7 +392,9 @@
continue;
}

- if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
+ mapping = page->mapping;
+
+ if (PageDirty(page) && is_page_cache_freeable(page) && mapping) {
/*
* It is not critical here to write it only if
* the page is unmapped beause any direct writer
@@ -402,7 +405,7 @@
*/
int (*writepage)(struct page *);

- writepage = page->mapping->a_ops->writepage;
+ writepage = mapping->a_ops->writepage;
if ((gfp_mask & __GFP_FS) && writepage) {
ClearPageDirty(page);
SetPageLaunder(page);
@@ -429,7 +432,7 @@
page_cache_get(page);

if (try_to_release_page(page, gfp_mask)) {
- if (!page->mapping) {
+ if (!mapping) {
/*
* We must not allow an anon page
* with no buffers to be visible on
@@ -466,13 +469,22 @@
}
}

- spin_lock(&pagecache_lock);
+ /*
+ * Page is locked, so mapping can't change under our
+ * feet.
+ */
+ if (!mapping) {
+ UnlockPage (page);
+ goto page_mapped;
+ }
+
+ spin_lock(&mapping->i_shared_lock);

/*
* this is the non-racy check for busy page.
*/
- if (!page->mapping || !is_page_cache_freeable(page)) {
- spin_unlock(&pagecache_lock);
+ if (!is_page_cache_freeable(page)) {
+ spin_unlock(&mapping->i_shared_lock);
UnlockPage(page);
page_mapped:
if (--max_mapped >= 0)
@@ -492,7 +504,7 @@
* the page is freeable* so not in use by anybody.
*/
if (PageDirty(page)) {
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
UnlockPage(page);
continue;
}
@@ -500,12 +512,12 @@
/* point of no return */
if (likely(!PageSwapCache(page))) {
__remove_inode_page(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
} else {
swp_entry_t swap;
swap.val = page->index;
__delete_from_swap_cache(page);
- spin_unlock(&pagecache_lock);
+ spin_unlock(&mapping->i_shared_lock);
swap_free(swap);
}


2001-11-26 15:26:22

by Ingo Molnar

Subject: Re: [PATCH] Scalable page cache


On 26 Nov 2001, Momchil Velikov wrote:

> Hi,
>
> This patch:
>
> - replaces the global page cache hash table with a per mapping
> splay tree;
>
> - eliminates the ``pagecache_lock'', instead ``i_shared_lock''
> is used so serialize access during insertion/deletion
> into/from the tree;
>
> The goals of the patch are to:
>
> - to improve scalability (via the elimination of the global
> lock);
>
> - reduce the memory/cache footprint (via to the
> ``page_hash_table'' elimination);
>
> The patch is against 2.4.16-pre1. Comments are welcome.

are you aware of the following patch? (written by David Miller and me.)

http://people.redhat.com/mingo/smp-pagecache-patches/pagecache-2.4.10-A3

it gets rid of the pagecache lock without introducing a tree.

while reducing memory footprint is a goal we want to achieve, the
pagecache hash is such a critical data structure that we want
O(1)-type search properties, not a tree. The pagecache hash takes up 0.2%
of RAM currently. (but we could cut the size of the hash in half i think,
it's a bit over-sized currently - it has as many entries as the system
has pages.)

The problem with the tree is that if we have a big, e.g. 16 GB pagecache,
then even assuming a perfectly balanced tree, it takes more than 20
iterations to find the page in the tree (which also means ~20 cachelines
touched, one per tree node we pass). Such an overhead (both algorithmic and
cache-footprint overhead) is absolutely out of the question - and it will only
get worse with more RAM, which isn't a good property.
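
A quick back-of-the-envelope check of that estimate (illustration only,
assuming 4 KB pages):

/*
 * A 16 GB pagecache with 4 KB pages holds 2^22 pages, so a perfectly
 * balanced binary tree over them is about 22 levels deep - hence the
 * "more than 20 iterations" above.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long cache_bytes = 16ULL << 30;	/* 16 GB */
	unsigned long long pages = cache_bytes / 4096;	/* 4 KB pages -> 2^22 */
	int depth = 0;

	while ((1ULL << depth) < pages)
		depth++;				/* ceil(log2(pages)) */

	printf("%llu pages, balanced tree depth ~ %d\n", pages, depth);
	return 0;
}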

hashes on the other hand are simple and fast, and we can always balance
performance against cache footprint and hash-table memory usage. This is
one reason why we kept the pagecache hash in our patch.

Ingo

2001-11-26 17:27:21

by Momchil Velikov

Subject: Re: [PATCH] Scalable page cache

>>>>> "Ingo" == Ingo Molnar <[email protected]> writes:

Ingo> On 26 Nov 2001, Momchil Velikov wrote:

>> Hi,
>>
>> This patch:
>>
>> - replaces the global page cache hash table with a per mapping
>> splay tree;
>>
>> - eliminates the ``pagecache_lock'', instead ``i_shared_lock''
>> is used so serialize access during insertion/deletion
>> into/from the tree;
>>
>> The goals of the patch are to:
>>
>> - to improve scalability (via the elimination of the global
>> lock);
>>
>> - reduce the memory/cache footprint (via to the
>> ``page_hash_table'' elimination);
>>
>> The patch is against 2.4.16-pre1. Comments are welcome.

Ingo> are you aware of the following patch? (written by David Miller and me.)

Ingo> http://people.redhat.com/mingo/smp-pagecache-patches/pagecache-2.4.10-A3

Yep. Folks on #kernelnewbies told me about it when there were only
changes to ``shrink_cache'' left. So, I decided to finish mine ;)

Ingo> it gets rid of the pagecache lock without introducing a tree.

Ingo> while reducing memory footprint is a goal we want to achieve, the
Ingo> pagecache hash is such a critical piece of data structure that we want
Ingo> O(1)-type search properties, not a tree. The pagetable hash takes up 0.2%
Ingo> of RAM currently. (but we could cut the size of the hash in half i think,
Ingo> it's a bit over-sized currently - it has as many entries.)

That's why I use a splay tree and not a red-black or AVL-balanced one - to
exploit the locality of reference, expecting O(1) on average.
Of course, I decided on a tree because it is hard to choose the right
hash size.

Ingo> The problem with the tree is that if we have a big, eg. 16 GB pagecache,
Ingo> then even assuming a perfectly balanced tree, it takes more than 20
Ingo> iterations to find the page in the tree. (which also means 20 cachelines
Ingo> touched per tree node we pass.) Such an overhead (both algorithmic and
Ingo> cache-footprint overhead) is absolutely out of question - and it will only
Ingo> get worse with more RAM, which isnt a good property.

The tree is per mapping, not a single global one. Now, with 16GB cached in a
single mapping, it'd perform poorly, indeed (though probably not 20 iterations).

Ingo> hashes on the other hand are simple and fast, and we can always balance
Ingo> performance against cache footprint and hash-table memory usage. This is
Ingo> one reason why we keept the pagetable hash in our patch.

Ingo> Ingo

2001-11-26 17:36:31

by Linus Torvalds

Subject: Re: [PATCH] Scalable page cache

In article <[email protected]>,
Ingo Molnar <[email protected]> wrote:
>
>it gets rid of the pagecache lock without introducing a tree.
>
>while reducing memory footprint is a goal we want to achieve, the
>pagecache hash is such a critical piece of data structure that we want
>O(1)-type search properties, not a tree. The pagetable hash takes up 0.2%
>of RAM currently. (but we could cut the size of the hash in half i think,
>it's a bit over-sized currently - it has as many entries.)

I actually considered a tree a long time ago, but I was thinking more
along the lines of the page table tree - with the optimization of being
able to perhaps map sub-trees _directly_ into the address space.

it's a cool idea, especially if done right (ie try to share the
functions between the VM trees and the "page cache tree"), but I was too
lazy to try it out (it's a _lot_ of work to do right). And I suspect
that it would optimize all the wrong cases, ie on x86 you could mmap
4MB-aligned areas at 4MB offsets with "zero cost", but in real life
that's not a very common situation.

Trees _do_ have advantages over hashes, though, in having both better
cache locality and better locking locality.

I don't think a binary tree (even if it is self-balancing) is the proper
format, though. Binary trees have bad cache characteristics, and as
Ingo points out, with large files (and many performance-critical things
like databases have _huge_ files) you get bad behaviour on lookup with a
binary tree.

An indexed tree (like the page tables) has much better characteristics,
and can be looked up in O(1), and might be worth looking into. The
locality of an indexed tree means that it's MUCH easier to efficiently
fill in (or get rid of) large contiguous chunks of page cache than it is
with hashes. This can be especially useful for doing swapping, where you
don't have to look up adjacent pages - they're right there, adjacent to
your entry.
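
A rough sketch of such an indexed tree (illustration only; the 6-bit
fanout and fixed height are arbitrary choices, not a design from this
thread):

/*
 * Page-table-style indexed tree: each level consumes a fixed slice of
 * the page index, so a lookup is a fixed number of array dereferences
 * rather than one key comparison per node.
 */
#define IDX_BITS	6
#define IDX_SLOTS	(1 << IDX_BITS)		/* 64 slots per node */
#define IDX_MASK	(IDX_SLOTS - 1)
#define IDX_HEIGHT	3			/* covers indices below 2^18 */

struct idx_node {
	void *slot[IDX_SLOTS];			/* child nodes, or pages at the leaves */
};

static void *idx_lookup(struct idx_node *root, unsigned long index)
{
	void *p = root;
	int shift = IDX_BITS * (IDX_HEIGHT - 1);

	while (p && shift >= 0) {
		p = ((struct idx_node *)p)->slot[(index >> shift) & IDX_MASK];
		shift -= IDX_BITS;
	}
	return p;				/* the page pointer, or NULL */
}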

Anybody interested?

Linus

2001-11-26 17:52:11

by Ingo Molnar

Subject: Re: [PATCH] Scalable page cache


On 26 Nov 2001, Momchil Velikov wrote:

> Yep. Folks on #kernelnewbies told me about it, when there were only
> changes to ``shrink_cache'' left. So, I decided to funish mine ;)

ok :) A search on Google for 'scalable pagecache' brings you straight to
our patch. I've uploaded the patch against 2.4.16 as well:

http://redhat.com/~mingo/smp-pagecache-patches/pagecache-2.4.16-A1

this is a (tested) port of the patch to the latest VM.

> The tree is per mapping, not a single one. Now, with 16GB cached in a
> single mapping, it'd perform poorly, indeed (though probably not 20).

some databases tend to keep all their stuff in big files. 16 GB ~== 2^22
pages, that's why i thought 20 was a good approximation of the depth of the tree.

And even with just a depth of 10 per mapping (files a few megabytes
in size - a totally mainstream thing), the cache footprint per lookup is
still 320 bytes with a 32-byte cacheline size, which is way too big. With
the hash table (page bucket hash table), the typical footprint for a
lookup is just around 2 cachelines, and one of those is a more 'compressed'
data structure.

we really only use trees in cases where it's absolutely necessary. There
are hybrid data structures mixing hashes and trees that are beneficial in some
cases, but the pagecache is mostly a random-indexed thing; seldom do we
want to scan adjacent pages. And even in that case, looking up the hash is
very cheap.

Ingo

2001-11-26 18:04:31

by David Miller

Subject: Re: [PATCH] Scalable page cache

From: Ingo Molnar <[email protected]>
Date: Mon, 26 Nov 2001 18:22:25 +0100 (CET)

The problem with the tree is that if we have a big, eg. 16 GB pagecache,
then even assuming a perfectly balanced tree, it takes more than 20
iterations to find the page in the tree.

His tree is per-inode, so you'd need a fully in-RAM 16GB _FILE_ to get
the bad tree search properties you describe.

I like his patch, a lot.

2001-11-26 18:06:01

by Ingo Molnar

Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Linus Torvalds wrote:

> Tree's _do_ have advantages over hashes, though, in having both better
> cache locality and better locking locality.
>
> I don't think a binary tree (even if it is self-balancing) is the
> proper format, though. Binary trees have bad cache characteristics,
> and as Ingo points out, with large files (and many
> performance-critical things like databases have _huge_ files) you get
> bad behaviour on lookup with a binary tree.
>
> A indexed tree (like the page tables) has much better characteristics,
> and can be looked up in O(1), and might be worth looking into. The
> locality of a indexed tree means that it's MUCH easier to efficiently
> fill in (or get rid of) large contiguous chunks of page cache than it
> is with hashes. This can be especially useful for doing swapping,
> where you don't have to look up adjacent pages - they're right there,
> adjacent to your entry.

i think the pagecache's data structures should be driven by cached accesses,
not by things like the cache footprint of swapping. In most systems, the
percentage of cached accesses is at least 90%, with 10% of pagecache
accesses being uncached. (i've run a number of pagecache profiles under
various large-system and small-system workloads.) I agree that if we can
get a data structure that is O(1) and has a 2-cacheline footprint, then
other factors (such as integration with other parts of the VM) could
weigh against the hash table.

but i don't think lookups can get any better than the current hash solution
- in high-performance environments, by far the biggest overheads are
cache misses and SMP-synchronization costs. Compressing the cache footprint
and spreading out the locking is the highest priority IMO.

right now the hash creates pretty good locality of reference: two pages
that are close to each other in the logical file space are close to each
other in the hash space as well. This is why scanning pages in the logical
space isn't all that bad cache-footprint-wise, i think. We do take/drop a
spinlock per scan, instead of just going one entry forward as in the indexed
tree solution, but this is not a significant overhead, given that the hash
has locality of reference as well.
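
That locality falls straight out of the _page_hashfn() the patch removes
(see the pagemap.h hunk above); a small standalone illustration, with the
sizeof(struct inode) divisor replaced by an arbitrary constant:

/*
 * For a fixed mapping, consecutive page indices land in consecutive
 * hash buckets (modulo the rare wrap when x crosses a 2^PAGE_HASH_BITS
 * boundary), which is the locality of reference described above.
 */
#include <stdio.h>

#define PAGE_HASH_BITS	13
#define PAGE_HASH_SIZE	(1UL << PAGE_HASH_BITS)

static unsigned long hashfn(unsigned long mapping, unsigned long index)
{
	unsigned long x = mapping / 512 + index;	/* 512 stands in for the inode-size scaling */

	return (x + (x >> PAGE_HASH_BITS)) & (PAGE_HASH_SIZE - 1);
}

int main(void)
{
	unsigned long index;

	for (index = 0; index < 4; index++)
		printf("index %lu -> bucket %lu\n",
		       index, hashfn(0xc1234500UL, index));
	return 0;
}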

If we can get a better cache footprint than this with indexed trees (which i
think are closer to hashes than to trees) then i'm all for it, but i cannot
see where the win comes from (even assuming statistically linear access to
the pagecache space, which is not true for a number of important
workloads).

Ingo

2001-11-26 18:09:51

by Christoph Hellwig

Subject: Re: [PATCH] Scalable page cache

Hi Ingo,

In article <[email protected]> you wrote:
> On 26 Nov 2001, Momchil Velikov wrote:
>
>> Yep. Folks on #kernelnewbies told me about it, when there were only
>> changes to ``shrink_cache'' left. So, I decided to funish mine ;)
>
> ok :) A search on Google for 'scalable pagecache' brings you straight to
> our patch. I've uploaded the patch against 2.4.16 as well:
>
> http://redhat.com/~mingo/smp-pagecache-patches/pagecache-2.4.16-A1
>
> this is a (tested) port of the patch to the latest VM.

This patch seems to have a number of interesting changes compared to
older versions.

- do_generic_file_read() now takes an additional integer parameter,
'nonblock'. This one is always zero, though. Why do you break
the interface?
- there is a new global function, flush_inode_pages(). It is not
used at all. What is this one supposed to do?
- file_send_actor() is no longer static in mm/filemap.c. I'm perfectly
fine with that as I will need it for the UnixWare sendv64 emulation
in Linux-ABI, but again no user outside of filemap.c exists.
- you change a number of parameters called 'offset' into 'index';
this makes sense to me, but doesn't really belong in this diff..
- due to the additional per-bucket spinlock the pagecache hash size
increases dramatically. Wouldn't it be a good idea to switch to
bootmem allocation so very big machines can still have the full
size, not limited by __get_free_pages failing?

2001-11-26 18:12:44

by Rik van Riel

Subject: Re: [PATCH] Scalable page cache

On Mon, 26 Nov 2001, David S. Miller wrote:
> From: Ingo Molnar <[email protected]>
> Date: Mon, 26 Nov 2001 18:22:25 +0100 (CET)
>
> The problem with the tree is that if we have a big, eg. 16 GB pagecache,
> then even assuming a perfectly balanced tree, it takes more than 20
> iterations to find the page in the tree.
>
> His tree is per-inode, so you'd need a fully in ram 16GB _FILE_ to get
> the bad tree search properties you describe.

Also, if you keep heavily accessing that file, chances
are the top nodes of the tree will be in cache.

> I like his patch, a lot.

Certainly removes some complexity ;)

Rik
--
DMCA, SSSCA, W3C? Who cares? http://thefreeworld.net/

http://www.surriel.com/ http://distro.conectiva.com/

2001-11-26 18:19:02

by Ingo Molnar

Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Christoph Hellwig wrote:

> - do_generic_file_read() now takes an additional integer parameter,
> 'nonblock'. This one always is zero, though. Why do you break
> the interface?

this is just the first, more important half of O_NONBLOCK support for
block IO. Some servers (squid, TUX) rely on it for good performance.

> - there is a new global function, flush_inode_pages(). It is not
> used at all. What is this one supposed to do?

it's for TUX's logfiles - the log file can grow to many gigabytes without
polluting the pagecache. I think a new syscall should also use this:
sys_flushfile(fd,offset,size), so that user-space servers can make use of
this feature as well. If anyone is interested in actually using this new
system-call then i can add those few additional lines. (other applications
could use it as well, eg. clustered databases to invalidate database
caches - but for this purpose the function has to become a bit more
strict.)

> - file_send_actor() is no more static in mm/filemap.c. I'm perfectly
> fine with that as I will need that for the UnixWare sendv64 emulation
> in Linux-ABI, but again no user outside of filemap.c exists.

a TUX thing again, you are right, it makes no sense otherwise.

> - you change a number of parameters called 'offset' into 'index',
> this makes sense to me, but doesn't really belong into this diff..

these are cleanups triggered by a bug i introduced: 'offset' was not consistently
used as a byte-granularity value, it was used as a page-granularity
index once, which caused a bug in an earlier version of the patch.

> - due to the additional per-bucket spinlock the pagecache size
> dramatically increases. Wouldn't it be a good idea to switch to
> bootmem allocation so very big machines still can have the full
> size, not limited by __get_free_pages failing?

for this purpose i wrote memarea_alloc() - bootmem IMO should remain a
limited boot-time helper, not a full-blown allocator.

Ingo

2001-11-26 18:19:01

by Benjamin LaHaise

Subject: Re: [PATCH] Scalable page cache

On Mon, Nov 26, 2001 at 07:23:52PM +0200, Momchil Velikov wrote:
...
> Ingo> The problem with the tree is that if we have a big, eg. 16 GB pagecache,
> Ingo> then even assuming a perfectly balanced tree, it takes more than 20
> Ingo> iterations to find the page in the tree. (which also means 20 cachelines
> Ingo> touched per tree node we pass.) Such an overhead (both algorithmic and
> Ingo> cache-footprint overhead) is absolutely out of question - and it will only
> Ingo> get worse with more RAM, which isnt a good property.
>
> The tree is per mapping, not a single one. Now, with 16GB cached in a
> single mapping, it'd perform poorly, indeed (though probably not 20).
...

I've looked at both splay trees and the page lock patch by Ingo & Dave, and
personally I disagree with both approaches as they fail to address some of
the very common cases that are actually causing the underlying problem.

First off, if we take a look at why the page cache lock is being contended,
a few obvious problems pop out immediately:

1. potentially long hash chains are walked with the page cache
lock held for the entire duration of the operation
2. multiple cache lines are touched while holding the page cache
lock
3. sequential lookups involve reacquiring the page cache lock
4. the page cache hash is too large, virtually assuring that
lookups will cause a cache miss

Neither proposed patch addresses all of these problems to the degree that
I think is possible and would like to attempt. What I've got in mind is
a hybrid hash + lock + b*tree scheme, where the lock is per-bucket. With
that in place, each bucket would be treated like the top of a b*tree
(I might have the name wrong, someone please correct me if what I'm
describing has a different name), where a number of {index, page} pairs
are stored, plus pointers to the next level of tree. The whole thing
would be sized at 1 cacheline per bucket and hold around 5-6 page pointers
per bucket (64 byte cacheline size). The lock would be a rw lock to
allow simultaneous readers (the most common operation).
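
A minimal sketch of the bucket layout just described, assuming 32-bit indices
and pointers (as on i386) so that one bucket fits in a 64-byte cacheline; the
field names and the overflow pointer are illustrative guesses, not a
worked-out design:

#include <linux/spinlock.h>     /* rwlock_t (2.4 headers) */
#include <linux/mm.h>           /* struct page */

#define ENTRIES_PER_BUCKET 6

struct pcache_bucket {
        rwlock_t lock;                           /* readers run in parallel */
        struct pcache_bucket *next_level;        /* overflow / next tree level */
        unsigned long index[ENTRIES_PER_BUCKET]; /* page->index values */
        struct page *page[ENTRIES_PER_BUCKET];   /* the matching pages */
};      /* 4 + 4 + 24 + 24 = 56 bytes with 32-bit words: within one line */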

This directly helps with #1 by allowing us to avoid pulling in extra
cache lines during the initial lookup phase since the page index #
is already in the cache line we've retrieved (we may need a pointer
to the address space, but the hash function can be chosen to make
such collisions irrelevant) as well as reducing the number of cache
lines accessed during a lookup. Since a tree is used for the worst
case, the length of a chain is reduced, with a very low likelihood
of ever exceeding 2 levels (remember, we're working on buckets in
each level of the tree, not individual page cache entries). Item 3
is addressed by improving cache locality for sequential lookups.
Item 4 can be addressed by changing the hash function to cause some
grouping for address spaces.

I haven't coded up this idea yet, but I'll get moving on it. If we're
going to change the locking again, I'd like to go all the way and do
it Right. To this end I'd like to see some appropriate benchmarks made
to look at the behaviour of the proposed changes under sequential access
patterns as well as random. Thoughts?

-ben
--
Fish.

2001-11-26 18:36:04

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

From: Ingo Molnar <[email protected]>
Date: Mon, 26 Nov 2001 21:29:39 +0100 (CET)

so i'm not against removing (or improving) the hash [our patch in fact
just left the hash alone], but the patch presented is not a win IMO.

Maybe you should give it a test to find out for sure :)

2001-11-26 18:44:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Benjamin LaHaise wrote:

> 1. potentially long hash chains are walked with the page cache
> lock held for the entire duration of the operation

not the case.

> 2. multiple cache lines are touched while holding the page cache
> lock

not the case because the problem with the pagecache_lock was its bouncing
between CPUs, not processes waiting on it.

> 3. sequential lookups involve reacquiring the page cache lock

the granularity of the pagecache directly influences the number of
accesses to the pagecache locking mechanism. Neither patch solves this;
the number of lock operations does not change - but the lock data
structures are spread out more.

i think it's a separate (and just as interesting) issue to decrease the
granularity of the pagecache - this not only decreases locking (and other
iteration) costs, it also decreases the size of the hash (or whatever
other data structure is used).

> 4. the page cache hash is too large, virtually assuring that
> lookups will cause a cache miss

this does not appear to be the case (see my other replies). Even if the
hash table is big and assuming the worst-case (we miss on every hash table
access), mem_map is *way* bigger in the cache because it has a much less
compressed format. The compression ratio between mem_map[] and the hash
table is 1:8 in the stock kernel, 1:4 with the page buckets patch.

Ingo

2001-11-26 18:54:14

by Daniel Phillips

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

On November 26, 2001 06:22 pm, Ingo Molnar wrote:
> are you aware of the following patch? (written by David Miller and me.)
>
> http://people.redhat.com/mingo/smp-pagecache-patches/pagecache-2.4.10-A3
>
> it gets rid of the pagecache lock without introducing a tree.

Great. How?

--
Daniel

2001-11-26 18:44:43

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

On Mon, 26 Nov 2001, Benjamin LaHaise wrote:

> Thoughts?

Please don't sacrifice code clarity on the altar of
absolute performance ;)

Rik
--
DMCA, SSSCA, W3C? Who cares? http://thefreeworld.net/

http://www.surriel.com/ http://distro.conectiva.com/

2001-11-26 18:37:41

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

On Mon, Nov 26, 2001 at 09:29:39PM +0100, Ingo Molnar wrote:
> this is a misunderstanding of the problem. The reason why the
> pagecache_lock is a performance problem is *not* contention, the reason is
> *not* the length of chains, or any other reason you listed. The problem is
> SMP cacheline invalidation costs, due to using the same cacheline from
> multiple CPUs. Thus the spreading out of locking gives good SMP cacheline
> usage properties.

Please reply to the rest of the message. Perhaps that item is a
misunderstanding, but the rest of it is applicable across the board.

-ben

2001-11-26 19:05:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


there is a case where the binary tree has less cache footprint than the
hash - when big contiguous areas of files are cached, *and* the access
patterns are linear. In this case the binary tree uses only the contiguous
mem_map[] area for its data structures - while the hash uses the hash
table as well. (which is +12% or +25%, in the stock/buckets variant.)

Ingo

2001-11-26 18:34:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Benjamin LaHaise wrote:

> First off, if we take a look at why the page cache lock is being
> contended, a few obvious problems pop out immediately:
>
> 1. potentially long hash chains are walked with the page cache
> lock held for the entire duration of the operation

this is a misunderstanding of the problem. The reason why the
pagecache_lock is a performance problem is *not* contention, the reason is
*not* the length of chains, or any other reason you listed. The problem is
SMP cacheline invalidation costs, due to using the same cacheline from
multiple CPUs. Thus the spreading out of locking gives good SMP cacheline
usage properties.

[ yes, we (well, mostly I) have analyzed such workloads where the
pagecache_lock starts being a problem. ]

the current hash table is pretty compressed, and the chains are short and
good - 1-2 entries in 95% of the cases even with all RAM being in the
pagecache, for a wide range of workloads. [in Anton's test we hit the
limit of the buddy allocator - this problem can be solved either via
bootmem allocation or via the memarea allocator i wrote.]

by using a binary tree we increase cache footprint visibly. Even assuming
the best case (access patterns go linearly in not too big files), the
cache footprint is 1.5*nr_accessed_pages cachelines (statistically). Via
the hash, it's nr_accessed_pages+nr_accessed_hash_cachelines, which is
lower, because in the stock kernel, nr_hash_cachelines is nr_pages/8, with
our patch it's nr_pages/4. So we have an almost 50% difference in cache
footprint - and this was the best-case!

so i'm not against removing (or improving) the hash [our patch in fact
just left the hash alone], but the patch presented is not a win IMO.

Ingo

2001-11-26 19:21:15

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

From: Ingo Molnar <[email protected]>
Date: Mon, 26 Nov 2001 22:09:02 +0100 (CET)

something like this:

- #define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags)
+ #define SetPageReferenced(page) \
+ if (!test_bit(PG_referenced, &(page)->flags)) \
+ set_bit(PG_referenced, &(page)->flags)

On some platforms all the {test_,}{clear,change,set}_bit() ops give
you this cache behavior. How hard would it be to make ix86 act
similarly?

2001-11-26 19:21:13

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

On Mon, Nov 26, 2001 at 09:40:53PM +0100, Ingo Molnar wrote:
> this does not appear to be the case (see my other replies). Even if the
> hash table is big and assuming the worst-case (we miss on every hash table
> access), mem_map is *way* bigger in the cache because it has a much less
> compressed format. The compression ratio between mem_map[] and the hash
> table is 1:8 in the stock kernel, 1:4 with the page buckets patch.

Well, the only point you've made that has any impact on the data structure
that I'm proposing is that the cacheline bouncing caused by lock acquisition
is the limiting factor. The best solution to that is to make the per-bucket
lock only used for insert and remove, and make lookups lockless. In order
to make that work, we need to break the current PageLock into a spinlock
portion and an io-lock flag, and grab the spinlock before removing the page
from the page cache. Now, would you care to comment on the data structure?

-ben
--
Fish.

2001-11-26 19:32:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

"David S. Miller" wrote:
>
> From: Ingo Molnar <[email protected]>
> Date: Mon, 26 Nov 2001 21:29:39 +0100 (CET)
>
> so i'm not against removing (or improving) the hash [our patch in fact
> just left the hash alone], but the patch presented is not a win IMO.
>
> Maybe you should give it a test to find out for sure :)

umm.. I've never seen any numbers from you, David.

Correct me if I'm wrong, but the pagecache_hash cost is
significant in the following situations:

1: TUX, because its pagecache lookups are not associated with
a page copy. This copy makes the benefits of the patch
unmeasurable with other workloads.

1a: Other sendfile-intensive applications. (Theoretical benefit.
No benchmark results have been seen).

2: NUMA hardware, where the cost of cacheline transfer is much
higher.

ergo, there is no point in futzing with the pagecache_lock *at all*
until either TUX is merged, or we decide to support large-scale
NUMA hardware well, which will require changes in other places.

Prove me wrong. Please.

-

2001-11-26 19:36:41

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

From: Andrew Morton <[email protected]>
Date: Mon, 26 Nov 2001 11:29:19 -0800

"David S. Miller" wrote:
> Maybe you should give it a test to find out for sure :)

umm.. I've never seen any numbers from you, David.

I'll work on something for you this week then :-)
Not a problem.

ergo, there is no point in futzing with the pagecache_lock *at all*
until either TUX is merged, or we decide to support large-scale
NUMA hardware well

Or your "1a", "other sendfile applications", right?

I really still feel (and will try to show with numbers) that the copy
being present is not so important. Like I have stated on other
occasions, several platforms do not harm L2 cache state during
copies; new lines are never brought in.

I expect x86 systems to move more in this direction if they aren't
somewhat there already.

2001-11-26 19:48:45

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

"David S. Miller" wrote:
>
> From: Ingo Molnar <[email protected]>
> Date: Mon, 26 Nov 2001 22:09:02 +0100 (CET)
>
> something like this:
>
> - #define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags)
> + #define SetPageReferenced(page) \
> + if (!test_bit(PG_referenced, &(page)->flags)) \
> + set_bit(PG_referenced, &(page)->flags)
>
> On some platforms all the {test_,}{clear,change,set}_bit() ops give
> you this cache behavior. How hard would it be to make ix86 act
> similarly?

It would be nice to have a full complement of non-atomic
bitops. At present we have __set_bit, but no __clear_bit, etc.

So we often do buslocked RMW's in places where it's not needed.
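
For illustration, a minimal C sketch of the kind of non-atomic helper being
asked for - a plain read-modify-write with no lock prefix, for callers that
already hold a suitable lock; the real thing would be per-arch assembly and
may differ in detail:

static inline void __clear_bit(int nr, volatile unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % (8 * sizeof(unsigned long)));
        unsigned long *p = (unsigned long *)addr + nr / (8 * sizeof(unsigned long));

        /* no bus lock: may race with concurrent writers, caller serializes */
        *p &= ~mask;
}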

-

2001-11-26 19:59:41

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

From: Andrew Morton <[email protected]>
Date: Mon, 26 Nov 2001 11:45:36 -0800

It would be nice to have a full complement of non-atomic
bitops. At present we have __set_bit, but no __clear_bit, etc.

So we often do buslocked RMW's in places where it's not needed.

I totally agree about fixing up bitops.

I have a patch which cleans up the filesystem bitops. Specifically,
right now we use things like "ext2_set_bit()" in places where we care
what the old value was and also in places where we don't.

So I added "ext2_test_and_set_bit()" and the like.

I therefore plan to flesh this out, and flesh out the non-atomic
bitops, then resubmit.
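
A sketch of the split being described, using the byte-based layout that ext2
bitmaps use; the _sketch suffix is only there to make clear these are
illustrations, not the real arch headers:

/* for call sites that do not care about the old value */
static inline void ext2_set_bit_sketch(int nr, void *addr)
{
        unsigned char *p = (unsigned char *)addr + (nr >> 3);

        *p |= 1 << (nr & 7);
}

/* for call sites that need to know whether the bit was already set */
static inline int ext2_test_and_set_bit_sketch(int nr, void *addr)
{
        unsigned char *p = (unsigned char *)addr + (nr >> 3);
        int old = (*p >> (nr & 7)) & 1;

        *p |= 1 << (nr & 7);
        return old;
}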

I did the work originally during 2.4.15-preX which made me have to
back it out of my tree temporarily while Linus was in anal patch
acceptance mode.

Ahh, speaking of bitops, what's the status of the ext3 bitops bugs I
found, Andrew? :-) Those are pretty serious.

2001-11-26 20:07:22

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

"David S. Miller" wrote:
>
> Ahh, speaking of bitops, what's the status of the ext3 bitops bugs I
> found Andrew? :-) Those are pretty serious.

err, umm, shuffles feet...

Here's the current rolled-up ext3 diff which I've been using.
It needs to be resynced with pestiferous ext3 CVS, changelogged
and pushed out this week.



--- linux-2.4.15-pre9/fs/ext3/inode.c Thu Nov 22 10:56:33 2001
+++ linux-akpm/fs/ext3/inode.c Thu Nov 22 11:07:03 2001
@@ -1026,9 +1026,20 @@ static int ext3_prepare_write(struct fil
if (ret != 0)
goto prepare_write_failed;

- if (ext3_should_journal_data(inode))
+ if (ext3_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page->buffers,
from, to, NULL, do_journal_get_write_access);
+ if (ret) {
+ /*
+ * We're going to fail this prepare_write(),
+ * so commit_write() will not be called.
+ * We need to undo block_prepare_write()'s kmap().
+ * AKPM: Do we need to clear PageUptodate? I don't
+ * think so.
+ */
+ kunmap(page);
+ }
+ }
prepare_write_failed:
if (ret)
ext3_journal_stop(handle, inode);
@@ -1096,7 +1107,7 @@ static int ext3_commit_write(struct file
kunmap(page);
if (pos > inode->i_size)
inode->i_size = pos;
- set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state);
+ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
} else {
if (ext3_should_order_data(inode)) {
ret = walk_page_buffers(handle, page->buffers,
@@ -1104,8 +1115,17 @@ static int ext3_commit_write(struct file
}
/* Be careful here if generic_commit_write becomes a
* required invocation after block_prepare_write. */
- if (ret == 0)
+ if (ret == 0) {
ret = generic_commit_write(file, page, from, to);
+ } else {
+ /*
+ * block_prepare_write() was called, but we're not
+ * going to call generic_commit_write(). So we
+ * need to perform generic_commit_write()'s kunmap
+ * by hand.
+ */
+ kunmap(page);
+ }
}
if (inode->i_size > inode->u.ext3_i.i_disksize) {
inode->u.ext3_i.i_disksize = inode->i_size;
@@ -1140,7 +1160,7 @@ static int ext3_bmap(struct address_spac
journal_t *journal;
int err;

- if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+ if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -1159,6 +1179,7 @@ static int ext3_bmap(struct address_spac
* everything they get.
*/

+ EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
journal = EXT3_JOURNAL(inode);
journal_lock_updates(journal);
err = journal_flush(journal);
@@ -2203,7 +2224,7 @@ static int ext3_do_update_inode(handle_t
/* If we are not tracking these fields in the in-memory inode,
* then preserve them on disk, but still initialise them to zero
* for new inodes. */
- if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) {
+ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
raw_inode->i_faddr = 0;
raw_inode->i_frag = 0;
raw_inode->i_fsize = 0;
@@ -2249,7 +2270,7 @@ static int ext3_do_update_inode(handle_t
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
- inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW;
+ EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;

out_brelse:
brelse (bh);
--- linux-2.4.15-pre9/fs/ext3/super.c Thu Nov 22 10:56:33 2001
+++ linux-akpm/fs/ext3/super.c Thu Nov 22 11:07:03 2001
@@ -1350,7 +1350,7 @@ static int ext3_load_journal(struct supe
journal_t *journal;
int journal_inum = le32_to_cpu(es->s_journal_inum);
int journal_dev = le32_to_cpu(es->s_journal_dev);
- int err;
+ int err = 0;
int really_read_only;

really_read_only = is_read_only(sb->s_dev);
@@ -1400,9 +1400,10 @@ static int ext3_load_journal(struct supe
}

if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
- journal_wipe(journal, !really_read_only);
+ err = journal_wipe(journal, !really_read_only);
+ if (!err)
+ err = journal_load(journal);

- err = journal_load(journal);
if (err) {
printk(KERN_ERR "EXT3-fs: error loading journal.\n");
journal_destroy(journal);
--- linux-2.4.15-pre9/fs/jbd/journal.c Thu Nov 22 10:56:33 2001
+++ linux-akpm/fs/jbd/journal.c Thu Nov 22 11:07:03 2001
@@ -757,7 +757,8 @@ journal_t * journal_init_inode (struct i
journal->j_inode = inode;
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
- journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size,
+ journal, bdevname(inode->i_dev), inode->i_ino,
+ (long long) inode->i_size,
inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);

journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
@@ -772,6 +773,18 @@ journal_t * journal_init_inode (struct i
return journal;
}

+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock (journal_t *journal)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+}
+
/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
@@ -825,6 +838,7 @@ int journal_create (journal_t *journal)
if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
printk (KERN_ERR "Journal length (%d blocks) too short.\n",
journal->j_maxlen);
+ journal_fail_superblock(journal);
return -EINVAL;
}

@@ -915,7 +929,8 @@ static int journal_get_superblock(journa
{
struct buffer_head *bh;
journal_superblock_t *sb;
-
+ int err = -EIO;
+
bh = journal->j_sb_buffer;

J_ASSERT(bh != NULL);
@@ -925,16 +940,18 @@ static int journal_get_superblock(journa
if (!buffer_uptodate(bh)) {
printk (KERN_ERR
"JBD: IO error reading journal superblock\n");
- return -EIO;
+ goto out;
}
}

sb = journal->j_superblock;

+ err = -EINVAL;
+
if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
sb->s_blocksize != htonl(journal->j_blocksize)) {
printk(KERN_WARNING "JBD: no valid journal superblock found\n");
- return -EINVAL;
+ goto out;
}

switch(ntohl(sb->s_header.h_blocktype)) {
@@ -946,17 +963,21 @@ static int journal_get_superblock(journa
break;
default:
printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
- return -EINVAL;
+ goto out;
}

if (ntohl(sb->s_maxlen) < journal->j_maxlen)
journal->j_maxlen = ntohl(sb->s_maxlen);
else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
printk (KERN_WARNING "JBD: journal file too short\n");
- return -EINVAL;
+ goto out;
}

return 0;
+
+out:
+ journal_fail_superblock(journal);
+ return err;
}

/*
@@ -1061,7 +1082,10 @@ void journal_destroy (journal_t *journal
/* We can now mark the journal as empty. */
journal->j_tail = 0;
journal->j_tail_sequence = ++journal->j_transaction_sequence;
- journal_update_superblock(journal, 1);
+ if (journal->j_sb_buffer) {
+ journal_update_superblock(journal, 1);
+ brelse(journal->j_sb_buffer);
+ }

if (journal->j_inode)
iput(journal->j_inode);
@@ -1069,7 +1093,6 @@ void journal_destroy (journal_t *journal
journal_destroy_revoke(journal);

unlock_journal(journal);
- brelse(journal->j_sb_buffer);
kfree(journal);
MOD_DEC_USE_COUNT;
}
--- linux-2.4.15-pre9/fs/jbd/transaction.c Thu Nov 22 10:56:33 2001
+++ linux-akpm/fs/jbd/transaction.c Thu Nov 22 11:07:03 2001
@@ -1058,21 +1058,6 @@ int journal_dirty_data (handle_t *handle
JBUFFER_TRACE(jh, "not on a transaction");
__journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
}
- /*
- * We need to mark the buffer dirty and refile it inside the lock to
- * protect it from release by journal_try_to_free_buffer()
- *
- * We set ->b_flushtime to something small enough to typically keep
- * kupdate away from the buffer.
- *
- * We don't need to do a balance_dirty() - __block_commit_write()
- * does that.
- */
- if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) {
- jh2bh(jh)->b_flushtime =
- jiffies + journal->j_commit_interval + 1 * HZ;
- refile_buffer(jh2bh(jh));
- }
no_journal:
spin_unlock(&journal_datalist_lock);
if (need_brelse) {
@@ -1604,8 +1589,6 @@ static int __journal_try_to_free_buffer(

assert_spin_locked(&journal_datalist_lock);

- if (!buffer_jbd(bh))
- return 1;
jh = bh2jh(bh);

if (buffer_locked(bh) || buffer_dirty(bh)) {
--- linux-2.4.15-pre9/Documentation/Configure.help Thu Nov 22 10:56:31 2001
+++ linux-akpm/Documentation/Configure.help Thu Nov 22 11:07:03 2001
@@ -13712,7 +13712,7 @@ CONFIG_EXT2_FS
Ext3 journaling file system support (EXPERIMENTAL)
CONFIG_EXT3_FS
This is the journaling version of the Second extended file system
- (often called ext3), the de facto standard Linux file system
+ (often called ext2), the de facto standard Linux file system
(method to organize files on a storage device) for hard disks.

The journaling code included in this driver means you do not have
--- linux-2.4.15-pre9/Documentation/Changes Mon Sep 17 23:10:44 2001
+++ linux-akpm/Documentation/Changes Thu Nov 22 11:07:03 2001
@@ -53,7 +53,7 @@ o Gnu make 3.77
o binutils 2.9.1.0.25 # ld -v
o util-linux 2.10o # fdformat --version
o modutils 2.4.2 # insmod -V
-o e2fsprogs 1.19 # tune2fs
+o e2fsprogs 1.25 # tune2fs
o reiserfsprogs 3.x.0j # reiserfsck 2>&1|grep reiserfsprogs
o pcmcia-cs 3.1.21 # cardmgr -V
o PPP 2.4.0 # pppd --version
@@ -317,8 +317,7 @@ o <ftp://rawhide.redhat.com/pub/rawhide

E2fsprogs
---------
-o <http://prdownloads.sourceforge.net/e2fsprogs/e2fsprogs-1.19.tar.gz>
-o <http://prdownloads.sourceforge.net/e2fsprogs/e2fsprogs-1.19-0.src.rpm>
+o <http://prdownloads.sourceforge.net/e2fsprogs/e2fsprogs-1.25.tar.gz>

Reiserfsprogs
-------------
--- linux-2.4.15-pre9/Documentation/filesystems/Locking Fri Feb 16 15:53:08 2001
+++ linux-akpm/Documentation/filesystems/Locking Thu Nov 22 11:07:03 2001
@@ -123,6 +123,10 @@ prototypes:
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
int (*bmap)(struct address_space *, long);
+ int (*flushpage) (struct page *, unsigned long);
+ int (*releasepage) (struct page *, int);
+ int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+
locking rules:
All may block
BKL PageLocked(page)
@@ -132,6 +136,8 @@ sync_page: no maybe
prepare_write: no yes
commit_write: no yes
bmap: yes
+flushpage: no yes
+releasepage: no yes

->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
@@ -144,6 +150,15 @@ well-defined...
filesystems and by the swapper. The latter will eventually go away. All
instances do not actually need the BKL. Please, keep it that way and don't
breed new callers.
+ ->flushpage() is called when the filesystem must attempt to drop
+some or all of the buffers from the page when it is being truncated. It
+returns zero on success. If ->flushpage is zero, the kernel uses
+block_flushpage() instead.
+ ->releasepage() is called when the kernel is about to try to drop the
+buffers from the page in preparation for freeing it. It returns zero to
+indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
+the kernel assumes that the fs has no private interest in the buffers.
+
Note: currently almost all instances of address_space methods are
using BKL for internal serialization and that's one of the worst sources
of contention. Normally they are calling library functions (in fs/buffer.c)

2001-11-26 20:43:02

by Daniel Phillips

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

On November 26, 2001 07:16 pm, Benjamin LaHaise wrote:

> First off, if we take a look at why the page cache lock is being contended,
> a few obvious problems pop out immediately:
>
> 1. potentially long hash chains are walked with the page cache
> lock held for the entire duration of the operation

Yes, having a not-very-random hash function makes this worse. What we see
is: the attempt to improve locality by having the hash function preserve
certain input bits can easily produce a net loss by causing a big variance in
hash chain length, increasing the average length of the list walk. At some
point I'll go check this theory for the page cache (in fact I think it's
already been checked, and a paper written about it).

> 2. multiple cache lines are touched while holding the page cache
> lock
> 3. sequential lookups involve reaquiring the page cache lock
> 4. the page cache hash is too large, virtually assuring that
> lookups will cause a cache miss

A more random hash improves (4) too, by allowing a smaller table.

Though somebody may well come up with a better structure than a hash table
for the page cache, I'm pretty sure we can make the existing approach work a
little better.

We could divide up the locking using pcache_locks [page_hash >> some_shift].
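
A minimal sketch of that idea - one spinlock picked from a small array, keyed
by the hash bucket; the constants and the masking are made up for
illustration, and the array is assumed to be initialised to SPIN_LOCK_UNLOCKED
at boot:

#include <linux/spinlock.h>

#define PCACHE_LOCK_SHIFT 6
#define NR_PCACHE_LOCKS   64    /* must be a power of two */

extern spinlock_t pcache_locks[NR_PCACHE_LOCKS];

static inline spinlock_t *pcache_lock(unsigned long page_hash)
{
        return &pcache_locks[(page_hash >> PCACHE_LOCK_SHIFT) &
                             (NR_PCACHE_LOCKS - 1)];
}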

--
Daniel

2001-11-26 20:46:01

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

In article <[email protected]> you write:
>
>Here's the current rolled-up ext3 diff which I've been using.
>It needs to be resynced with pestiferous ext3 CVS, changelogged
>and pushed out this week.
>
>--- linux-2.4.15-pre9/fs/ext3/inode.c Thu Nov 22 10:56:33 2001
>+++ linux-akpm/fs/ext3/inode.c Thu Nov 22 11:07:03 2001
>@@ -1026,9 +1026,20 @@ static int ext3_prepare_write(struct fil
> if (ret != 0)
> goto prepare_write_failed;
>
>- if (ext3_should_journal_data(inode))
>+ if (ext3_should_journal_data(inode)) {
> ret = walk_page_buffers(handle, page->buffers,
> from, to, NULL, do_journal_get_write_access);
>+ if (ret) {
>+ /*
>+ * We're going to fail this prepare_write(),
>+ * so commit_write() will not be called.
>+ * We need to undo block_prepare_write()'s kmap().
>+ * AKPM: Do we need to clear PageUptodate? I don't
>+ * think so.
>+ */
>+ kunmap(page);
>+ }
>+ }

This is wrong.

The generic VM layer does the kmap/kunmap itself these days (it really
always should have - it needs the page mapped itself for the memcpy
anyway, and depending on the low-level FS to do kmap/kunmap was an ugly
layering violation).

So you should just remove all the kmap/kunmap stuff in
prepare/commit_write instead of adding more of them to handle the
brokenness.

Linus

2001-11-26 21:05:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

Linus Torvalds wrote:
>
> In article <[email protected]> you write:
> >
> >Here's the current rolled-up ext3 diff which I've been using.
> >It needs to be resynced with pestiferous ext3 CVS, changelogged
> >and pushed out this week.
> >
> >--- linux-2.4.15-pre9/fs/ext3/inode.c Thu Nov 22 10:56:33 2001
> >+++ linux-akpm/fs/ext3/inode.c Thu Nov 22 11:07:03 2001
> >@@ -1026,9 +1026,20 @@ static int ext3_prepare_write(struct fil
> > if (ret != 0)
> > goto prepare_write_failed;
> >
> >- if (ext3_should_journal_data(inode))
> >+ if (ext3_should_journal_data(inode)) {
> > ret = walk_page_buffers(handle, page->buffers,
> > from, to, NULL, do_journal_get_write_access);
> >+ if (ret) {
> >+ /*
> >+ * We're going to fail this prepare_write(),
> >+ * so commit_write() will not be called.
> >+ * We need to undo block_prepare_write()'s kmap().
> >+ * AKPM: Do we need to clear PageUptodate? I don't
> >+ * think so.
> >+ */
> >+ kunmap(page);
> >+ }
> >+ }
>
> This is wrong.
>
> The generic VM layer does the kmap/kunmap itself these days (it really
> always should have - it needs the page mapped itself for the memcpy
> anyway, and depending on the low-level FS to do kmap/kunmap was an ugly
> layering violation).

Actually the comment is a bit misleading.

We've called block_prepare_write(), which has done the kmap.
But even though block_prepare_write() returned success, this
call to the filesystem's ->prepare_write() is about to fail.

So the caller of ->prepare_write() isn't going to run ->commit_write(),
and we have to undo the kmap here.

> So you should just remove all the kmap/kunmap stuff in
> prepare/commit_write instead of adding more of them to handle the
> brokenness.

There have been a number of mistakes made over this particular kmap()
operation. NFS client had it wrong for a while. I think sct had
some proposal for making it more robust.

-

2001-11-26 19:12:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


hm, as far as i can see, your patch dirties both cachelines of the target
struct page on every lookup (page_splay_tree_find()), correct? If correct,
is this just an implementational thing, or something more fundamental?

i've been thinking about getting rid of some of the cacheline dirtying in
the page lookup code, i.e. something like this:

- #define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags)
+ #define SetPageReferenced(page) \
+ if (!test_bit(PG_referenced, &(page)->flags)) \
+ set_bit(PG_referenced, &(page)->flags)

this would have the benefit of not touching the cacheline while doing a
simple read(), if the referenced bit is still set. (which is not cleared
too eagerly in most no-VM-pressure cases.) And it should not be a problem
that this is not race-free - failing to set the referenced bit is not a
big failure.

Ingo

2001-11-26 19:15:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Ingo Molnar wrote:

> [...] And it should not be a problem that this is not race-free -
> missing to set the referenced bit is not a big failure.

(in the swapping code this race might be more lethal - so this
optimization would be for the pagecache lookup only.)

Ingo

2001-11-26 22:29:34

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Andrew Morton wrote:
>
> We've called block_prepare_write(), which has done the kmap.
> But even though block_prepare_write() returned success, this
> call to the filesystem's ->prepare_write() is about to fail.

That's _way_ too intimate knowledge of how block_prepare_write() works (or
doesn't work).

How about sending me a patch that removes all the kmap/kunmap crap from
_both_ ext3 and block_prepare/commit_write.

> There have been a number of mistakes made over this particular kmap()
> operation. NFS client had it wrong for a while. I think sct had
> some proposal for making it more robust.

It _is_ more robust. You are only battling remnants from an older age when
it wasn't.

Linus

2001-11-26 22:49:46

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache

Linus Torvalds wrote:
>
> On Mon, 26 Nov 2001, Andrew Morton wrote:
> >
> > We've called block_prepare_write(), which has done the kmap.
> > But even though block_prepare_write() returned success, this
> > call to the filesystem's ->prepare_write() is about to fail.
>
> That's _way_ too intimate knowledge of how block_prepare_write() works (or
> doesn't work).
>
> How about sending me a patch that removes all the kmap/kunmap crap from
> _both_ ext3 and block_prepare/commit_write.

FWIW Al Viro pointed out to me yesterday that block_xxx are really
nothing but helpers... Depending on them doing or not doing certain
things is IMHO ok...

If the behavior of the helpers is not what is desired, you are free to
ignore them and roll your own... or wrap them as it appears ext3 is
doing here.

Jeff



--
Jeff Garzik | Only so many songs can be sung
Building 1024 | with two lips, two lungs, and one tongue.
MandrakeSoft | - nomeansno

2001-11-26 23:12:58

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Scalable page cache


On Mon, 26 Nov 2001, Jeff Garzik wrote:
>
> FWIW Al Viro pointed out to me yesterday that block_xxx are really
> nothing but helpers... Depending on them doing or not doing certain
> things is IMHO ok...

They _are_ helpers, but I was actually _this_ close to removing the
kmap/kunmap from them a few weeks ago when we fixed the NFS uses of the
same. People should NOT depend on those kinds of subtle internal things.

Linus

2001-11-27 08:11:55

by Anton Blanchard

[permalink] [raw]
Subject: benchmark results: Scalable page cache


> His tree is per-inode, so you'd need a fully in ram 16GB _FILE_ to get
> the bad tree search properties you describe.

Well, let's pass my dbench-o-matic over the two patches.

http://samba.org/~anton/linux/pagecache_locking/

The tests were run on an old 12-way ppc64 machine. I might do the runs
again on a newer 16-way; it just takes ages to boot it.

summary.png shows an equal improvement from 8 clients through to 12
clients with either patch. Ignore the breakdown after 12 clients; I need
to look closer at what is going on there.

I was seeing larger problems with the pagecache lock on zero copy
workloads with the standard kernel. It might show up some differences in
the two approaches with that sort of workload; if I get a chance I'll do
more testing.

The dbench-o-matic gives some more detailed information as well as the
summary. For each dbench run you can get the results from the kernel
statistical profiler. For the top 40 kernel functions you can also see
the breakdown of hits (in % of total hits for that function) against
the disassembly.

e.g. for standard 2.4.16 and 12 clients, the __find_lock_page disassembly
is:

http://samba.org/~anton/linux/pagecache_locking/2.4.16/12/8.html

One hint for reading the disassembly: this is a spin lock acquisition
on ppc64:

c0000000000e38e8 b c0000000000e3900
c0000000000e38ec mr r1,r1
c0000000000e38f0 lwzx r11,r0,r0
c0000000000e38f4 cmpwi r11,0
c0000000000e38f8 bne+ c0000000000e38ec
c0000000000e38fc mr r2,r2
c0000000000e3900 lwarx r11,r0,r0
c0000000000e3904 cmpwi r11,0
c0000000000e3908 bne- c0000000000e38ec
c0000000000e390c stwcx. r9,r0,r0
c0000000000e3910 bne- c0000000000e3900
c0000000000e3914 isync

If you see a large percentage on the first five instructions, then we
are spending a lot of time busy-spinning, waiting for a spinlock to be
dropped. Another useful tip is that hits are usually attributed to the
instruction after the offending one, because we use the return address
from the timer interrupt.
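
For readers who don't speak ppc64 assembly, a rough C rendering of that loop
(an illustration, not the kernel's actual implementation): lwarx/stwcx. is the
atomic attempt to claim the lock, the lwzx/cmpwi/bne+ block is a relaxed spin
on plain loads while the lock is held, and isync is the acquire barrier.

static void spin_lock_sketch(volatile unsigned int *lock)
{
        for (;;) {
                /* lwarx ... stwcx.: atomically try to set the lock word */
                if (__sync_lock_test_and_set(lock, 1) == 0)
                        break;                  /* isync: acquire barrier */
                /* lwzx/cmpwi/bne+: spin on plain loads while it is held,
                   so we don't keep pulling the cacheline in exclusively */
                while (*lock != 0)
                        ;
        }
}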

In the case of __find_lock_page we see both contention (looping over the
first five instructions) and also what looks to be cacheline ping-ponging
(the high % of hits on the instruction after the lwarx).

Anton