2012-10-20 15:49:31

by Joonsoo Kim

Subject: [PATCH for-v3.7 1/2] slub: optimize poorly inlined kmalloc* functions

kmalloc() and kmalloc_node() are always inlined into generic code.
However, there is a mistake in SLUB's implementation.

In SLUB's kmalloc() and kmalloc_node(), we compare kmalloc_caches[index]
with NULL. Since this pointer is not known at compile time, the comparison
is emitted into every caller of kmalloc*. This may hurt performance,
so we should fix it.
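
To illustrate, here is a simplified sketch of kmalloc_index() (not the
exact kernel source): it is __always_inline and consists only of
comparisons against size, so for a compile-time-constant size it folds
to a constant and the "if (!index)" check is eliminated, while a NULL
check on the runtime value kmalloc_caches[index] cannot be.

static __always_inline int kmalloc_index(size_t size)
{
	if (!size)
		return 0;	/* the caller maps index 0 to ZERO_SIZE_PTR */
	if (size <= 8)
		return 3;	/* 2^3-byte cache */
	if (size <= 16)
		return 4;
	if (size <= 32)
		return 5;
	/* ... one branch per power-of-two kmalloc cache ... */
	return -1;	/* too large for the kmalloc caches */
}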

Below is the output of "size vmlinux".
Text size is reduced by roughly 20KB.

Before:
text data bss dec hex filename
10044177 1443168 5722112 17209457 1069871 vmlinux
After:
text data bss dec hex filename
10022627 1443136 5722112 17187875 1064423 vmlinux

Cc: Christoph Lameter <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>
---
Christoph's patchset (common kmalloc caches:
'[15/15] Common Kmalloc cache determination'), which is not yet merged
into mainline, will also fix this issue.
Since that will take some time, I am sending this patch for v3.7.

This patch is based on v3.7-rc1

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index df448ad..4c75f2b 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -271,9 +271,10 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
 			return kmalloc_large(size, flags);
 
 		if (!(flags & SLUB_DMA)) {
-			struct kmem_cache *s = kmalloc_slab(size);
+			int index = kmalloc_index(size);
+			struct kmem_cache *s = kmalloc_caches[index];
 
-			if (!s)
+			if (!index)
 				return ZERO_SIZE_PTR;
 
 			return kmem_cache_alloc_trace(s, flags, size);
@@ -304,9 +305,10 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	if (__builtin_constant_p(size) &&
 		size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) {
-		struct kmem_cache *s = kmalloc_slab(size);
+		int index = kmalloc_index(size);
+		struct kmem_cache *s = kmalloc_caches[index];
 
-		if (!s)
+		if (!index)
 			return ZERO_SIZE_PTR;
 
 		return kmem_cache_alloc_node_trace(s, flags, node, size);
--
1.7.9.5


2012-10-20 15:49:38

by Joonsoo Kim

Subject: [PATCH for-v3.7 2/2] slub: optimize kmalloc* inlining for GFP_DMA

SLUB's kmalloc() and kmalloc_node() are not inlined when @flags = __GFP_DMA.
This patch optimizes that case,
so that when @flags = __GFP_DMA, the call is inlined into generic code.
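
As an illustration, consider a hypothetical caller (not taken from this
patch): both size and flags are compile-time constants, so the SLUB_DMA
test folds away and only the DMA cache lookup remains inline.

/* Hypothetical example; 64 == 2^6, so kmalloc_index(64) is 6. */
static void *alloc_legacy_dma_buffer(void)
{
	/*
	 * With this patch, this call effectively compiles down to
	 * kmem_cache_alloc_trace(kmalloc_dma_caches[6], GFP_KERNEL | GFP_DMA, 64)
	 * instead of taking the out-of-line __kmalloc() path.
	 */
	return kmalloc(64, GFP_KERNEL | GFP_DMA);
}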

Cc: Christoph Lameter <[email protected]>
Signed-off-by: Joonsoo Kim <[email protected]>

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 4c75f2b..4adf50b 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -147,6 +147,7 @@ struct kmem_cache {
  * 2^x bytes of allocations.
  */
 extern struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
+extern struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
 
 /*
  * Sorry that the following has to be that ugly but some versions of GCC
@@ -266,19 +267,24 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)

 static __always_inline void *kmalloc(size_t size, gfp_t flags)
 {
+	struct kmem_cache *s;
+	int index;
+
 	if (__builtin_constant_p(size)) {
 		if (size > SLUB_MAX_SIZE)
 			return kmalloc_large(size, flags);
 
-		if (!(flags & SLUB_DMA)) {
-			int index = kmalloc_index(size);
-			struct kmem_cache *s = kmalloc_caches[index];
-
-			if (!index)
-				return ZERO_SIZE_PTR;
+		index = kmalloc_index(size);
+		if (!index)
+			return ZERO_SIZE_PTR;
+#ifdef CONFIG_ZONE_DMA
+		if (unlikely(flags & SLUB_DMA)) {
+			s = kmalloc_dma_caches[index];
+		} else
+#endif
+			s = kmalloc_caches[index];
 
-			return kmem_cache_alloc_trace(s, flags, size);
-		}
+		return kmem_cache_alloc_trace(s, flags, size);
 	}
 	return __kmalloc(size, flags);
 }
@@ -303,13 +309,19 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,

 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
-	if (__builtin_constant_p(size) &&
-		size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) {
-		int index = kmalloc_index(size);
-		struct kmem_cache *s = kmalloc_caches[index];
+	struct kmem_cache *s;
+	int index;
 
+	if (__builtin_constant_p(size) && size <= SLUB_MAX_SIZE) {
+		index = kmalloc_index(size);
 		if (!index)
 			return ZERO_SIZE_PTR;
+#ifdef CONFIG_ZONE_DMA
+		if (unlikely(flags & SLUB_DMA)) {
+			s = kmalloc_dma_caches[index];
+		} else
+#endif
+			s = kmalloc_caches[index];
 
 		return kmem_cache_alloc_node_trace(s, flags, node, size);
 	}
diff --git a/mm/slub.c b/mm/slub.c
index a0d6984..a94533c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3222,7 +3222,8 @@ struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+EXPORT_SYMBOL(kmalloc_dma_caches);
 #endif
 
 static int __init setup_slub_min_order(char *str)
--
1.7.9.5

by Christoph Lameter

Subject: Re: [PATCH for-v3.7 2/2] slub: optimize kmalloc* inlining for GFP_DMA

On Sun, 21 Oct 2012, Joonsoo Kim wrote:

> SLUB's kmalloc() and kmalloc_node() are not inlined when @flags = __GFP_DMA.
> This patch optimizes that case,
> so that when @flags = __GFP_DMA, the call is inlined into generic code.

__GFP_DMA is a rarely used flag for the kmalloc allocators, and so far it
has not been considered worth supporting directly in the inlining code.

2012-10-23 02:29:46

by Joonsoo Kim

Subject: Re: [PATCH for-v3.7 2/2] slub: optimize kmalloc* inlining for GFP_DMA

2012/10/22 Christoph Lameter <[email protected]>:
> On Sun, 21 Oct 2012, Joonsoo Kim wrote:
>
>> SLUB's kmalloc() and kmalloc_node() are not inlined when @flags = __GFP_DMA.
>> This patch optimizes that case,
>> so that when @flags = __GFP_DMA, the call is inlined into generic code.
>
> __GFP_DMA is a rarely used flag for the kmalloc allocators, and so far it
> has not been considered worth supporting directly in the inlining code.
>
>

Hmm... but SLAB already does that optimization for __GFP_DMA.
Almost every kmalloc() is invoked with a constant flags value,
so I think the overhead from this patch should be negligible.
With this patch, the code size of vmlinux is reduced slightly.
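
For reference, SLAB's inline path in slab_def.h looks roughly like the
sketch below (simplified, not verbatim; the function name and the elided
index walk are mine):

static __always_inline void *slab_kmalloc_sketch(size_t size, gfp_t flags)
{
	struct kmem_cache *cachep;
	int i = 0;

	/* compile-time walk over kmalloc_sizes.h to find index i, elided */

#ifdef CONFIG_ZONE_DMA
	if (flags & GFP_DMA)
		cachep = malloc_sizes[i].cs_dmacachep;
	else
#endif
		cachep = malloc_sizes[i].cs_cachep;

	/* for constant flags, the branch above is resolved at compile time */
	return kmem_cache_alloc_trace(cachep, flags, size);
}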

2012-10-23 06:17:03

by Eric Dumazet

Subject: Re: [PATCH for-v3.7 2/2] slub: optimize kmalloc* inlining for GFP_DMA

On Tue, 2012-10-23 at 11:29 +0900, JoonSoo Kim wrote:
> 2012/10/22 Christoph Lameter <[email protected]>:
> > On Sun, 21 Oct 2012, Joonsoo Kim wrote:
> >
> >> SLUB's kmalloc() and kmalloc_node() are not inlined when @flags = __GFP_DMA.
> >> This patch optimizes that case,
> >> so that when @flags = __GFP_DMA, the call is inlined into generic code.
> >
> > __GFP_DMA is a rarely used flag for the kmalloc allocators, and so far it
> > has not been considered worth supporting directly in the inlining code.
> >
> >
>
> Hmm... but SLAB already does that optimization for __GFP_DMA.
> Almost every kmalloc() is invoked with a constant flags value,
> so I think the overhead from this patch should be negligible.
> With this patch, the code size of vmlinux is reduced slightly.

Only because you used an allyesconfig.

GFP_DMA is used for less than 0.1% of kmalloc() calls, for legacy
hardware (from last century).


In fact, if you want to reduce your vmlinux even further, you could test

	if (__builtin_constant_p(flags) && (flags & SLUB_DMA))
		return kmem_cache_alloc_trace(s, flags, size);

to force the call to out-of-line code.



2012-10-23 16:12:38

by Joonsoo Kim

Subject: Re: [PATCH for-v3.7 2/2] slub: optimize kmalloc* inlining for GFP_DMA

Hi, Eric.

2012/10/23 Eric Dumazet <[email protected]>:
> On Tue, 2012-10-23 at 11:29 +0900, JoonSoo Kim wrote:
>> 2012/10/22 Christoph Lameter <[email protected]>:
>> > On Sun, 21 Oct 2012, Joonsoo Kim wrote:
>> >
>> >> SLUB's kmalloc() and kmalloc_node() are not inlined when @flags = __GFP_DMA.
>> >> This patch optimizes that case,
>> >> so that when @flags = __GFP_DMA, the call is inlined into generic code.
>> >
>> > __GFP_DMA is a rarely used flag for the kmalloc allocators, and so far it
>> > has not been considered worth supporting directly in the inlining code.
>> >
>> >
>>
>> Hmm... but SLAB already does that optimization for __GFP_DMA.
>> Almost every kmalloc() is invoked with a constant flags value,
>> so I think the overhead from this patch should be negligible.
>> With this patch, the code size of vmlinux is reduced slightly.
>
> Only because you used an allyesconfig.
>
> GFP_DMA is used for less than 0.1% of kmalloc() calls, for legacy
> hardware (from last century).

I'm not using allyesconfig,
but localmodconfig on my Ubuntu desktop system.
On my system, the text of vmlinux shrinks by 700 bytes,
which means there may be more than 100 call sites using GFP_DMA.

> In fact, if you want to reduce your vmlinux even further, you could test
>
> 	if (__builtin_constant_p(flags) && (flags & SLUB_DMA))
> 		return kmem_cache_alloc_trace(s, flags, size);
>
> to force the call to out-of-line code.

The reason I mention code size is that I want to say it may be good
for performance, even though the impact is small.
I'm not interested in reducing code size :)

Thanks for the comment.

2012-10-24 08:05:24

by Pekka Enberg

Subject: Re: [PATCH for-v3.7 1/2] slub: optimize poorly inlined kmalloc* functions

On Sat, Oct 20, 2012 at 6:48 PM, Joonsoo Kim <[email protected]> wrote:
> kmalloc() and kmalloc_node() are always inlined into generic code.
> However, there is a mistake in SLUB's implementation.
>
> In SLUB's kmalloc() and kmalloc_node(), we compare kmalloc_caches[index]
> with NULL. Since this pointer is not known at compile time, the comparison
> is emitted into every caller of kmalloc*. This may hurt performance,
> so we should fix it.
>
> Below is the output of "size vmlinux".
> Text size is reduced by roughly 20KB.
>
> Before:
> text data bss dec hex filename
> 10044177 1443168 5722112 17209457 1069871 vmlinux
> After:
> text data bss dec hex filename
> 10022627 1443136 5722112 17187875 1064423 vmlinux
>
> Cc: Christoph Lameter <[email protected]>
> Signed-off-by: Joonsoo Kim <[email protected]>
> ---
> Christoph's patchset (common kmalloc caches:
> '[15/15] Common Kmalloc cache determination'), which is not yet merged
> into mainline, will also fix this issue.
> Since that will take some time, I am sending this patch for v3.7.
>
> This patch is based on v3.7-rc1

Looks reasonable to me. Christoph, any objections?


by Christoph Lameter

Subject: Re: [PATCH for-v3.7 1/2] slub: optimize poorly inlined kmalloc* functions

On Wed, 24 Oct 2012, Pekka Enberg wrote:

> Looks reasonable to me. Christoph, any objections?

I am fine with it. It's going to be short-lived because my latest patchset
will do the same. Can we merge this for 3.7?