2002-10-26 19:15:41

by Manfred Spraul

Subject: [PATCH,RFC] faster kmalloc lookup

--- 2.5/mm/slab.c Sat Oct 26 21:13:33 2002
+++ build-2.5/mm/slab.c Sat Oct 26 20:40:09 2002
@@ -424,6 +430,7 @@
CN("size-131072")
};
#undef CN
+static struct cache_sizes *malloc_hints[sizeof(size_t)*8];

struct arraycache_init initarray_cache __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
struct arraycache_init initarray_generic __initdata = { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
@@ -587,6 +594,7 @@
void __init kmem_cache_init(void)
{
size_t left_over;
+ int i;

init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain);
@@ -604,6 +612,18 @@
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
+
+ for (i=0;i<sizeof(size_t)*8;i++) {
+ struct cache_sizes *csizep = malloc_sizes;
+ int size = (1<<i)/2+1;
+
+ for ( ; csizep->cs_size; csizep++) {
+ if (size > csizep->cs_size)
+ continue;
+ break;
+ }
+ malloc_hints[i] = csizep;
+ }
}


@@ -1796,7 +1816,11 @@
*/
void * kmalloc (size_t size, int flags)
{
- struct cache_sizes *csizep = malloc_sizes;
+ struct cache_sizes *csizep;
+
+ if(unlikely(size<2))
+ size=2;
+ csizep = malloc_hints[fls((size-1))];

for (; csizep->cs_size; csizep++) {
if (size > csizep->cs_size)


Attachments:
patch-fast-kmalloc (1.19 kB)
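
A worked example of how the hint table short-circuits the scan (illustration only, not part of the patch): for kmalloc(100, GFP_KERNEL), fls(99) is 7, and malloc_hints[7] was initialized in kmem_cache_init with the threshold (1<<7)/2+1 = 65, so it already points at the first general cache of at least 65 bytes (the 96-byte cache on configurations where it exists). The remaining linear scan only has to step over that single entry before settling on the 128-byte cache, instead of walking the whole table from 32 bytes upward.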

2002-10-26 22:05:57

by Alan

Subject: Re: [PATCH,RFC] faster kmalloc lookup

On Sat, 2002-10-26 at 20:22, Manfred Spraul wrote:
> kmalloc spends a large part of the total execution time trying to find
> the cache for the passed in size.
>
> What about the attached patch (against 2.5.44-mm5)?
> It uses fls to jump over the caches that are definitively too small.

Out of curiosity, how does fls compare with finding the right cache by
using a binary tree walk? A lot of platforms seem to use generic_fls,
which has a lot of conditions in it and also a lot of references to
just-computed values that look likely to stall.
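
For context, the generic fallback looks roughly like this (reconstructed from memory as a sketch; the exact form varies between trees):

static inline int generic_fls(int x)
{
	/* Portable find-last-set: a cascade of tests that halve the
	 * search range, each one shifting the value and adjusting the
	 * result. */
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) { x <<= 16; r -= 16; }
	if (!(x & 0xff000000u)) { x <<= 8;  r -= 8; }
	if (!(x & 0xf0000000u)) { x <<= 4;  r -= 4; }
	if (!(x & 0xc0000000u)) { x <<= 2;  r -= 2; }
	if (!(x & 0x80000000u)) { x <<= 1;  r -= 1; }
	return r;
}

Each test examines a value that was shifted only in the previous step, which is the just-computed dependency Alan suspects will stall.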

2002-10-27 10:02:35

by Manfred Spraul

Subject: Re: [PATCH,RFC] faster kmalloc lookup

--- 2.5/include/asm-i386/bitops.h Sun Sep 22 06:25:12 2002
+++ build-2.5/include/asm-i386/bitops.h Sun Oct 27 11:04:57 2002
@@ -414,11 +414,22 @@
return word;
}

-/*
+/**
* fls: find last bit set.
+ * @x: The word to search
+ *
*/

-#define fls(x) generic_fls(x)
+static inline int fls(int x)
+{
+ int r;
+
+ __asm__("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "g" (x));
+ return r+1;
+}

#ifdef __KERNEL__


Attachments:
patch-fls (449.00 B)
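
For reference (not part of the patch): on x86, bsrl sets ZF and is documented to leave the destination register undefined when the source is zero, which is what the jnz/movl $-1 fallback handles; the net effect is that fls(0) returns 0 and fls(1) returns 1, matching generic_fls.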

2002-10-27 13:23:12

by Manfred Spraul

Subject: Re: [PATCH,RFC] faster kmalloc lookup

I've run my slab microbenchmark over the 3 versions:
- current
- generic_fls
- i386 asm optimized fls

The test reports the fastest time for 100 kmalloc calls in a tight loop
(Duron 700). Loop/test overhead subtracted.

32-byte alloc:
current: 41 ticks
generic_fls: 56 ticks
bsrl: 54 ticks

4096-byte alloc:
current: 84 ticks
generic_fls: 53 ticks
bsrl: 54 ticks

That's a 40-tick difference for -current between 4096 and 32 bytes - roughly
4 cycles for each loop iteration.
The bit scan is 10 ticks slower for 32-byte allocs, 30 ticks faster for
4096-byte allocs.

No difference between generic_fls and bsrl - the branch predictor can
easily predict all branches in generic_fls for constant kmalloc calls.

--
Manfred

2002-10-28 12:59:16

by Nikita Danilov

Subject: Re: [PATCH,RFC] faster kmalloc lookup

Manfred Spraul writes:
> I've run my slab microbenchmark over the 3 versions:
> - current
> - generic_fls
> - i386 asm optimized fls
>
> The test reports the fastest time for 100 kmalloc calls in a tight loop
> (Duron 700). Loop/test overhead subtracted.
>
> 32-byte alloc:
> current: 41 ticks
> generic_fls: 56 ticks
> bsrl: 54 ticks
>
> 4096-byte alloc:
> current: 84 ticks
> generic_fls: 53 ticks
> bsrl: 54 ticks
>
> That's a 40-tick difference for -current between 4096 and 32 bytes - roughly
> 4 cycles for each loop iteration.
> The bit scan is 10 ticks slower for 32-byte allocs, 30 ticks faster for
> 4096-byte allocs.
>
> No difference between generic_fls and bsrl - the branch predictor can
> easily predict all branches in generic_fls for constant kmalloc calls.
>

Most kmalloc calls get a constant size argument (usually
sizeof(something)). So, if a switch() is used instead of a loop (and
kmalloc is made inline), the compiler would be able to optimize away the
cache_sizes[] selection completely. The attached (ugly) patch does this.
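
As a concrete illustration (not from the patch itself): a call like kmalloc(sizeof(struct foo), GFP_KERNEL), for a hypothetical struct of, say, 200 bytes, lets the compiler evaluate the switch at compile time and emit a direct __kmalloc(5, 200, GFP_KERNEL) call, with no runtime search at all.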

> --

Nikita.
===== include/linux/slab.h 1.13 vs edited =====
--- 1.13/include/linux/slab.h Thu Sep 26 04:41:05 2002
+++ edited/include/linux/slab.h Mon Oct 28 15:55:35 2002
@@ -58,8 +58,91 @@
extern void kmem_cache_free(kmem_cache_t *, void *);
extern unsigned int kmem_cache_size(kmem_cache_t *);

-extern void *kmalloc(size_t, int);
extern void kfree(const void *);
+extern void * __kmalloc (int i, size_t size, int flags);
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ *
+ * The @flags argument may be one of:
+ *
+ * %GFP_USER - Allocate memory on behalf of user. May sleep.
+ *
+ * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
+ *
+ * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
+ *
+ * Additionally, the %GFP_DMA flag may be set to indicate the memory
+ * must be suitable for DMA. This can mean different things on different
+ * platforms. For example, on i386, it means that the memory must come
+ * from the first 16MB.
+ */
+static inline void * kmalloc (size_t size, int flags)
+{
+ int i;
+
+ switch(size) {
+#if PAGE_SIZE == 4096
+ case 0 ... 32:
+ i = 0;
+ break;
+ case 33 ... 64:
+#else
+ case 0 ... 64:
+#endif
+ i = 1;
+ break;
+ case 65 ... 96:
+ i = 2;
+ break;
+ case 97 ... 128:
+ i = 3;
+ break;
+ case 129 ... 192:
+ i = 4;
+ break;
+ case 193 ... 256:
+ i = 5;
+ break;
+ case 257 ... 512:
+ i = 6;
+ break;
+ case 513 ... 1024:
+ i = 7;
+ break;
+ case 1025 ... 2048:
+ i = 8;
+ break;
+ case 2049 ... 4096:
+ i = 9;
+ break;
+ case 4097 ... 8192:
+ i = 10;
+ break;
+ case 8193 ... 16384:
+ i = 11;
+ break;
+ case 16385 ... 32768:
+ i = 12;
+ break;
+ case 32769 ... 65536:
+ i = 13;
+ break;
+ case 65537 ... 131072:
+ i = 14;
+ break;
+ default:
+ return NULL;
+ }
+
+ return __kmalloc(i, size, flags);
+}
+

extern int FASTCALL(kmem_cache_reap(int));

===== mm/slab.c 1.33 vs edited =====
--- 1.33/mm/slab.c Sat Sep 28 18:23:50 2002
+++ edited/mm/slab.c Mon Oct 28 16:01:24 2002
@@ -1573,40 +1573,6 @@
}

/**
- * kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- *
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user. May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
- *
- * Additionally, the %GFP_DMA flag may be set to indicate the memory
- * must be suitable for DMA. This can mean different things on different
- * platforms. For example, on i386, it means that the memory must come
- * from the first 16MB.
- */
-void * kmalloc (size_t size, int flags)
-{
- cache_sizes_t *csizep = cache_sizes;
-
- for (; csizep->cs_size; csizep++) {
- if (size > csizep->cs_size)
- continue;
- return __kmem_cache_alloc(flags & GFP_DMA ?
- csizep->cs_dmacachep : csizep->cs_cachep, flags);
- }
- return NULL;
-}
-
-/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
* @objp: The previously allocated object.
@@ -1626,6 +1592,19 @@
local_irq_save(flags);
__kmem_cache_free(cachep, objp);
local_irq_restore(flags);
+}
+
+void * __kmalloc (int i, size_t size, int flags)
+{
+ cache_sizes_t *csizep;
+
+#if PAGE_SIZE == 4096
+ csizep = &cache_sizes[i];
+#else
+ csizep = &cache_sizes[i - 1];
+#endif
+ return kmem_cache_alloc(flags & GFP_DMA ?
+ csizep->cs_dmacachep : csizep->cs_cachep, flags);
}

/**

2002-10-28 13:11:55

by Marcus Alanen

Subject: Re: [PATCH,RFC] faster kmalloc lookup

>Most kmalloc calls get a constant size argument (usually
>sizeof(something)). So, if a switch() is used instead of a loop (and
>kmalloc is made inline), the compiler would be able to optimize away the
>cache_sizes[] selection completely. The attached (ugly) patch does this.

Perhaps a compile-time test to check if the argument is
a constant, and only in that case call your new kmalloc, otherwise
a non-inline kmalloc call? With your current patch, a non-constant
size argument to kmalloc means that the function is inlined anyway,
leading to unnecessary bloat in the resulting image.

Marcus
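
The compile-time dispatch Marcus suggests can be sketched as a two-way macro (this is essentially the pattern Manfred posts later in the thread; __constant_kmalloc and __kmalloc are the names used there):

/* Use the inline, constant-folded path only when the compiler can prove
 * the size is a compile-time constant; otherwise call the plain,
 * out-of-line function. */
#define kmalloc(size, flags)					\
	(__builtin_constant_p(size) ?				\
		__constant_kmalloc((size), (flags)) :		\
		__kmalloc((size), (flags)))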

2002-10-28 13:20:19

by Nikita Danilov

Subject: Re: [PATCH,RFC] faster kmalloc lookup

Marcus Alanen writes:
> >Most kmalloc calls get a constant size argument (usually
> >sizeof(something)). So, if a switch() is used instead of a loop (and
> >kmalloc is made inline), the compiler would be able to optimize away the
> >cache_sizes[] selection completely. The attached (ugly) patch does this.
>
> Perhaps a compile-time test to check if the argument is
> a constant, and only in that case call your new kmalloc, otherwise
> a non-inline kmalloc call? With your current patch, a non-constant
> size argument to kmalloc means that the function is inlined anyway,
> leading to unnecessary bloat in the resulting image.

Yes, exactly.

>
> Marcus
>

Nikita.

2002-10-28 15:57:35

by Manfred Spraul

Subject: Re: [PATCH,RFC] faster kmalloc lookup

Nikita Danilov wrote:

>Marcus Alanen writes:
> > >Most kmalloc calls get a constant size argument (usually
> > >sizeof(something)). So, if a switch() is used instead of a loop (and
> > >kmalloc is made inline), the compiler would be able to optimize away the
> > >cache_sizes[] selection completely. The attached (ugly) patch does this.
> >
> > Perhaps a compile-time test to check if the argument is
> > a constant, and only in that case call your new kmalloc, otherwise
> > a non-inline kmalloc call? With your current patch, a non-constant
> > size argument to kmalloc means that the function is inlined anyway,
> > leading to unnecessary bloat in the resulting image.
>
>Yes, exactly.
>
>
I agree; I have an old patch that does that.

http://www.colorfullife.com/~manfred/slab/patch-km_div

Please ignore the part about fixed-point division; it's not needed - the
'div' instruction is now outside of the hot path.

The problem is that the -mm tree contains around 10 slab patches; I want
to see them in Linus' tree before I add further patches.

--
Manfred

2002-10-31 15:34:13

by Arjan van de Ven

Subject: Re: [PATCH,RFC] faster kmalloc lookup

On Sat, 2002-10-26 at 21:22, Manfred Spraul wrote:
> kmalloc spends a large part of the total execution time trying to find
> the cache for the passed in size.

Would it be possible for fixed-size kmallocs to have the compiler
precalculate this?




2002-10-31 22:59:06

by Manfred Spraul

Subject: Re: [PATCH,RFC] faster kmalloc lookup

// $Header$
// Kernel Version:
// VERSION = 2
// PATCHLEVEL = 5
// SUBLEVEL = 45
// EXTRAVERSION =
--- 2.5/include/linux/slab.h Sat Oct 26 21:02:12 2002
+++ build-2.5/include/linux/slab.h Thu Oct 31 23:58:26 2002
@@ -58,7 +58,80 @@
extern void kmem_cache_free(kmem_cache_t *, void *);
extern unsigned int kmem_cache_size(kmem_cache_t *);

-extern void *kmalloc(size_t, int);
+/* Size description struct for general caches. */
+struct cache_sizes {
+ size_t cs_size;
+ kmem_cache_t *cs_cachep;
+ kmem_cache_t *cs_dmacachep;
+};
+extern struct cache_sizes malloc_sizes[];
+extern void __you_cannot_kmalloc_more_than_128_kilo_bytes(void);
+
+enum malloc_numbers
+{
+#if PAGE_SIZE == 4096
+ kmalloc_size_32 = 0,
+#endif
+ kmalloc_size_64,
+#if L1_CACHE_BYTES < 64
+ kmalloc_size_96,
+#endif
+ kmalloc_size_128,
+#if L1_CACHE_BYTES < 128
+ kmalloc_size_192,
+#endif
+ kmalloc_size_256,
+ kmalloc_size_512,
+ kmalloc_size_1024,
+ kmalloc_size_2048,
+ kmalloc_size_4096,
+ kmalloc_size_8192,
+ kmalloc_size_16384,
+ kmalloc_size_32768,
+ kmalloc_size_65536,
+ kmalloc_size_131072
+};
+
+static inline void* __constant_kmalloc(size_t size, int flags)
+{
+#define FIXED_ALLOC(len) \
+ if(size <= len) \
+ return kmem_cache_alloc( (flags & GFP_DMA)? \
+ malloc_sizes[kmalloc_size_##len].cs_dmacachep \
+ : malloc_sizes[kmalloc_size_##len].cs_cachep, \
+ flags)
+#if PAGE_SIZE == 4096
+ FIXED_ALLOC(32);
+#endif
+ FIXED_ALLOC(64);
+#if L1_CACHE_BYTES < 64
+ FIXED_ALLOC(96);
+#endif
+ FIXED_ALLOC(128);
+#if L1_CACHE_BYTES < 128
+ FIXED_ALLOC(192);
+#endif
+ FIXED_ALLOC(256);
+ FIXED_ALLOC(512);
+ FIXED_ALLOC(1024);
+ FIXED_ALLOC(2048);
+ FIXED_ALLOC(4096);
+ FIXED_ALLOC(8192);
+ FIXED_ALLOC(16384);
+ FIXED_ALLOC(32768);
+ FIXED_ALLOC(65536);
+ FIXED_ALLOC(131072);
+#undef FIXED_ALLOC
+ __you_cannot_kmalloc_more_than_128_kilo_bytes();
+ return NULL;
+}
+extern void *__kmalloc(size_t, int);
+
+#define kmalloc(size, flags) \
+ (__builtin_constant_p(size) ? \
+ __constant_kmalloc((size),(flags)) : \
+ __kmalloc(size,flags))
+
extern void kfree(const void *);

extern int FASTCALL(kmem_cache_reap(int));
--- 2.5/mm/slab.c Thu Oct 31 18:48:19 2002
+++ build-2.5/mm/slab.c Thu Oct 31 23:58:26 2002
@@ -369,15 +369,8 @@
#define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x))
#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->list.prev)

-/* Size description struct for general caches. */
-struct cache_sizes {
- size_t cs_size;
- kmem_cache_t *cs_cachep;
- kmem_cache_t *cs_dmacachep;
-};
-
/* These are the default caches for kmalloc. Custom caches can have other sizes. */
-static struct cache_sizes malloc_sizes[] = {
+struct cache_sizes malloc_sizes[] = {
#if PAGE_SIZE == 4096
{ 32, NULL, NULL},
#endif
@@ -1804,7 +1797,7 @@
* platforms. For example, on i386, it means that the memory must come
* from the first 16MB.
*/
-void * kmalloc (size_t size, int flags)
+void * __kmalloc (size_t size, int flags)
{
struct cache_sizes *csizep = malloc_sizes;

--- 2.5/kernel/ksyms.c Thu Oct 31 18:48:18 2002
+++ build-2.5/kernel/ksyms.c Thu Oct 31 23:44:54 2002
@@ -102,7 +102,8 @@
EXPORT_SYMBOL(kmem_cache_size);
EXPORT_SYMBOL(set_shrinker);
EXPORT_SYMBOL(remove_shrinker);
-EXPORT_SYMBOL(kmalloc);
+EXPORT_SYMBOL(malloc_sizes);
+EXPORT_SYMBOL(__kmalloc);
EXPORT_SYMBOL(kfree);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(__vmalloc);


Attachments:
patch-kmalloc-fast (3.32 kB)

2002-11-01 07:56:43

by Andi Kleen

Subject: Re: [PATCH,RFC] faster kmalloc lookup

Manfred Spraul <[email protected]> writes:

> is there a simpler way to achieve that? Right now the list of kmalloc
> caches exists in 3 copies, which could easily get out of sync.

x86-64 system calls had a similar problem. I solved it by putting
them into a separate file with macros and including it multiple
times with different macro definitions.
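
A minimal sketch of that approach applied to the kmalloc caches (the file name and CACHE() macro are hypothetical, not from any posted patch):

/* kmalloc_sizes.h - the single authoritative list of general cache
 * sizes. Each user defines CACHE(x) before including it. */
CACHE(32)
CACHE(64)
CACHE(128)
/* ... up to CACHE(131072) */

/* mm/slab.c: expand the list once into the table of general caches. */
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { x, NULL, NULL },
#include "kmalloc_sizes.h"
#undef CACHE
	{ 0, NULL, NULL }
};

/* include/linux/slab.h: expand the same list again, e.g. as the enum of
 * cache indices used by the inline kmalloc. */
enum kmalloc_indices {
#define CACHE(x) kmalloc_index_##x,
#include "kmalloc_sizes.h"
#undef CACHE
	KMALLOC_NR_CACHES
};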

It would be quite useful to have it in a separate file: I'm still
toying with the idea of writing a memory profiler to compute a better
list of slab sizes than the relatively dumb power-of-two default.
Previously it was trivial to change the table at boot; now it would at
least be a bit easier if it were a simple file that could be easily
rewritten.

-Andi