Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S263980AbUFKOWI (ORCPT ); Fri, 11 Jun 2004 10:22:08 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S263984AbUFKOWI (ORCPT ); Fri, 11 Jun 2004 10:22:08 -0400 Received: from mx1.redhat.com ([66.187.233.31]:52708 "EHLO mx1.redhat.com") by vger.kernel.org with ESMTP id S263980AbUFKOVu (ORCPT ); Fri, 11 Jun 2004 10:21:50 -0400 From: David Howells In-Reply-To: <567.1086950642@redhat.com> References: <567.1086950642@redhat.com> To: torvalds@osdl.org, akpm@osdl.org Cc: linux-kernel@vger.kernel.org Subject: [PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size [#2] User-Agent: EMH/1.14.1 SEMI/1.14.5 (Awara-Onsen) FLIM/1.14.5 (Demachiyanagi) APEL/10.6 Emacs/21.3 (i386-redhat-linux-gnu) MULE/5.0 (SAKAKI) MIME-Version: 1.0 (generated by SEMI 1.14.5 - "Awara-Onsen") Content-Type: multipart/mixed; boundary="Multipart_Fri_Jun_11_15:21:45_2004-1" Date: Fri, 11 Jun 2004 15:21:45 +0100 Message-ID: <6567.1086963705@redhat.com> Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 11338 Lines: 392 --Multipart_Fri_Jun_11_15:21:45_2004-1 Content-Type: text/plain; charset=US-ASCII Hi Linus, Andrew, Here's an update to my patch. Thanks to Ingo Oeser for noticing that the patch had a couple of problems in the allocation loop (it would never end if an allocation failed, and the result of the allocation didn't need casting). David --Multipart_Fri_Jun_11_15:21:45_2004-1 Content-Type: text/plain; type=patch; charset=US-ASCII Content-Disposition: attachment; filename="bootmem-hash-267rc3-2.diff" Content-Transfer-Encoding: 7bit diff -uNrp linux-2.6.7-rc3/fs/dcache.c linux-2.6.7-rc3-hash/fs/dcache.c --- linux-2.6.7-rc3/fs/dcache.c 2004-06-11 09:36:25.000000000 +0100 +++ linux-2.6.7-rc3-hash/fs/dcache.c 2004-06-11 10:52:23.000000000 +0100 @@ -30,6 +30,7 @@ #include #include #include +#include #define DCACHE_PARANOIA 1 /* #define DCACHE_DEBUG 1 */ @@ -1561,13 +1562,31 @@ static int __init set_dhash_entries(char } __setup("dhash_entries=", set_dhash_entries); -static void __init dcache_init(unsigned long mempages) +static void __init dcache_init_early(void) { - struct hlist_head *d; - unsigned long order; - unsigned int nr_hash; - int i; + struct hlist_head *p; + int loop; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask); + + p = dentry_hashtable; + loop = 1 << d_hash_shift; + do { + INIT_HLIST_HEAD(p); + p++; + loop--; + } while (loop); +} +static void __init dcache_init(unsigned long mempages) +{ /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature @@ -1580,45 +1599,6 @@ static void __init dcache_init(unsigned NULL, NULL); set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); - - if (!dhash_entries) - dhash_entries = PAGE_SHIFT < 13 ? - mempages >> (13 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 13); - - dhash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - d_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - d_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - d_hash_shift++; - - dentry_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (dentry_hashtable == NULL && --order >= 0); - - printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); - - if (!dentry_hashtable) - panic("Failed to allocate dcache hash table\n"); - - d = dentry_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(d); - d++; - i--; - } while (i); } /* SLAB cache for __getname() consumers */ @@ -1632,6 +1612,12 @@ EXPORT_SYMBOL(d_genocide); extern void bdev_cache_init(void); extern void chrdev_init(void); +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + void __init vfs_caches_init(unsigned long mempages) { unsigned long reserve; diff -uNrp linux-2.6.7-rc3/fs/inode.c linux-2.6.7-rc3-hash/fs/inode.c --- linux-2.6.7-rc3/fs/inode.c 2004-06-11 09:36:27.000000000 +0100 +++ linux-2.6.7-rc3-hash/fs/inode.c 2004-06-11 10:52:48.744705411 +0100 @@ -20,6 +20,7 @@ #include #include #include +#include /* * This is needed for the following functions: @@ -1345,54 +1346,35 @@ __setup("ihash_entries=", set_ihash_entr /* * Initialize the waitqueues and inode hash table. */ -void __init inode_init(unsigned long mempages) +void __init inode_init_early(void) { - struct hlist_head *head; - unsigned long order; - unsigned int nr_hash; - int i; - - for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) - init_waitqueue_head(&i_wait_queue_heads[i].wqh); + struct hlist_head *p; + int loop; - if (!ihash_entries) - ihash_entries = PAGE_SHIFT < 14 ? - mempages >> (14 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 14); - - ihash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++) - ; + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + 0, + &i_hash_shift, + &i_hash_mask); + p = inode_hashtable; + loop = 1 << i_hash_shift; do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - i_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - i_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - i_hash_shift++; - - inode_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (inode_hashtable == NULL && --order >= 0); - - printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); + INIT_HLIST_HEAD(p); + p++; + loop--; + } while (loop); +} - if (!inode_hashtable) - panic("Failed to allocate inode hash table\n"); +void __init inode_init(unsigned long mempages) +{ + int i; - head = inode_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(head); - head++; - i--; - } while (i); + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), diff -uNrp linux-2.6.7-rc3/include/linux/bootmem.h linux-2.6.7-rc3-hash/include/linux/bootmem.h --- linux-2.6.7-rc3/include/linux/bootmem.h 2004-06-11 09:25:03.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/bootmem.h 2004-06-11 10:38:23.000000000 +0100 @@ -67,4 +67,12 @@ extern void * __init __alloc_bootmem_nod __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int consider_highmem, + unsigned int *_hash_shift, + unsigned int *_hash_mask); + #endif /* _LINUX_BOOTMEM_H */ diff -uNrp linux-2.6.7-rc3/include/linux/fs.h linux-2.6.7-rc3-hash/include/linux/fs.h --- linux-2.6.7-rc3/include/linux/fs.h 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/fs.h 2004-06-11 10:46:44.290698753 +0100 @@ -221,6 +221,7 @@ extern int leases_enable, dir_notify_ena extern void update_atime (struct inode *); extern void inode_init(unsigned long); +extern void inode_init_early(void); extern void mnt_init(unsigned long); extern void files_init(unsigned long); @@ -1199,6 +1200,7 @@ extern int filp_close(struct file *, fl_ extern char * getname(const char __user *); /* fs/dcache.c */ +extern void vfs_caches_init_early(void); extern void vfs_caches_init(unsigned long); #define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) diff -uNrp linux-2.6.7-rc3/include/linux/mmzone.h linux-2.6.7-rc3-hash/include/linux/mmzone.h --- linux-2.6.7-rc3/include/linux/mmzone.h 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/include/linux/mmzone.h 2004-06-11 09:44:41.531812981 +0100 @@ -20,6 +20,18 @@ #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif +/* + * system hash table size limits + * - on large memory machines, we may want to allocate a bigger hash than that + * permitted by MAX_ORDER, so we allocate with the bootmem allocator, and are + * limited to this size + */ +#if MAX_ORDER > 14 +#define MAX_SYS_HASH_TABLE_ORDER MAX_ORDER +#else +#define MAX_SYS_HASH_TABLE_ORDER 14 +#endif + struct free_area { struct list_head free_list; unsigned long *map; diff -uNrp linux-2.6.7-rc3/init/main.c linux-2.6.7-rc3-hash/init/main.c --- linux-2.6.7-rc3/init/main.c 2004-06-11 09:36:36.565153337 +0100 +++ linux-2.6.7-rc3-hash/init/main.c 2004-06-11 10:48:37.028742509 +0100 @@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void initrd_start = 0; } #endif + vfs_caches_init_early(); mem_init(); kmem_cache_init(); if (late_time_init) diff -uNrp linux-2.6.7-rc3/mm/page_alloc.c linux-2.6.7-rc3-hash/mm/page_alloc.c --- linux-2.6.7-rc3/mm/page_alloc.c 2004-06-11 09:36:36.000000000 +0100 +++ linux-2.6.7-rc3-hash/mm/page_alloc.c 2004-06-11 11:21:59.761918397 +0100 @@ -55,6 +55,9 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; +static unsigned long __initdata nr_kernel_pages; +static unsigned long __initdata nr_all_pages; + /* * Temporary debugging check for pages not lying within a given zone. */ @@ -1454,6 +1457,10 @@ static void __init free_area_init_core(s if (zholes_size) realsize -= zholes_size[j]; + if (j == ZONE_DMA || j == ZONE_NORMAL) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; @@ -1994,3 +2001,78 @@ int lower_zone_protection_sysctl_handler setup_per_zone_protection(); return 0; } + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + */ +static inline int log2(unsigned long x) __attribute__((pure)); +static inline int log2(unsigned long x) +{ + int r = 0; + for (x >>= 1; x > 0; x >>= 1) + r++; + return r; +} + +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int consider_highmem, + unsigned int *_hash_shift, + unsigned int *_hash_mask) +{ + unsigned long mem, max, log2qty, size; + void *table; + + /* round applicable memory size up to nearest megabyte */ + mem = consider_highmem ? nr_all_pages : nr_kernel_pages; + mem += (1UL << (20 - PAGE_SHIFT)) - 1; + mem >>= 20 - PAGE_SHIFT; + mem <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory (rounded up to + * nearest power of 2 in size) */ + if (scale > PAGE_SHIFT) + mem >>= (scale - PAGE_SHIFT); + else + mem <<= (PAGE_SHIFT - scale); + + mem = 1UL << (log2(mem) + 1); + + /* limit allocation size */ + max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize; + if (max > mem) + max = mem; + + /* allow the kernel cmdline to have a say */ + if (!numentries || numentries > max) + numentries = max; + + log2qty = log2(numentries); + + do { + size = bucketsize << log2qty; + + table = alloc_bootmem(size); + + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + log2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} --Multipart_Fri_Jun_11_15:21:45_2004-1-- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/