On x86 allyesconfig build:
The function compiles to 130 bytes of machine code.
It has 493 callsites.
Total reduction of vmlinux size: 27906 bytes.
text data bss dec hex filename
82447071 22255384 20627456 125329911 77861f7 vmlinux4
82419165 22255384 20627456 125302005 777f4f5 vmlinux5
Signed-off-by: Denys Vlasenko <[email protected]>
CC: Eric W. Biederman <[email protected]>
CC: David S. Miller <[email protected]>
CC: Jan Engelhardt <[email protected]>
CC: Jiri Pirko <[email protected]>
CC: [email protected]
CC: [email protected]
---
include/net/netns/generic.h | 15 +--------------
net/core/net_namespace.c | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 0931618..61a33bf 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -31,18 +31,5 @@ struct net_generic {
void *ptr[0];
};
-static inline void *net_generic(const struct net *net, int id)
-{
- struct net_generic *ng;
- void *ptr;
-
- rcu_read_lock();
- ng = rcu_dereference(net->gen);
- BUG_ON(id == 0 || id > ng->len);
- ptr = ng->ptr[id - 1];
- rcu_read_unlock();
-
- BUG_ON(!ptr);
- return ptr;
-}
+void *net_generic(const struct net *net, int id);
#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index cb5290b..66c9ba1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -42,6 +42,22 @@ EXPORT_SYMBOL(init_net);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+void *net_generic(const struct net *net, int id)
+{
+ struct net_generic *ng;
+ void *ptr;
+
+ rcu_read_lock();
+ ng = rcu_dereference(net->gen);
+ BUG_ON(id == 0 || id > ng->len);
+ ptr = ng->ptr[id - 1];
+ rcu_read_unlock();
+
+ BUG_ON(!ptr);
+ return ptr;
+}
+EXPORT_SYMBOL(net_generic);
+
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
--
1.8.1.4
Tue, Apr 14, 2015 at 02:25:11PM CEST, [email protected] wrote:
>On x86 allyesconfig build:
>The function compiles to 130 bytes of machine code.
>It has 493 callsites.
>Total reduction of vmlinux size: 27906 bytes.
Hmm. That is not much. How about performance impacts?
>
> text data bss dec hex filename
>82447071 22255384 20627456 125329911 77861f7 vmlinux4
>82419165 22255384 20627456 125302005 777f4f5 vmlinux5
>
>Signed-off-by: Denys Vlasenko <[email protected]>
>CC: Eric W. Biederman <[email protected]>
>CC: David S. Miller <[email protected]>
>CC: Jan Engelhardt <[email protected]>
>CC: Jiri Pirko <[email protected]>
>CC: [email protected]
>CC: [email protected]
>---
> include/net/netns/generic.h | 15 +--------------
> net/core/net_namespace.c | 16 ++++++++++++++++
> 2 files changed, 17 insertions(+), 14 deletions(-)
>
>diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
>index 0931618..61a33bf 100644
>--- a/include/net/netns/generic.h
>+++ b/include/net/netns/generic.h
>@@ -31,18 +31,5 @@ struct net_generic {
> void *ptr[0];
> };
>
>-static inline void *net_generic(const struct net *net, int id)
>-{
>- struct net_generic *ng;
>- void *ptr;
>-
>- rcu_read_lock();
>- ng = rcu_dereference(net->gen);
>- BUG_ON(id == 0 || id > ng->len);
>- ptr = ng->ptr[id - 1];
>- rcu_read_unlock();
>-
>- BUG_ON(!ptr);
>- return ptr;
>-}
>+void *net_generic(const struct net *net, int id);
> #endif
>diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
>index cb5290b..66c9ba1 100644
>--- a/net/core/net_namespace.c
>+++ b/net/core/net_namespace.c
>@@ -42,6 +42,22 @@ EXPORT_SYMBOL(init_net);
>
> static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
>
>+void *net_generic(const struct net *net, int id)
>+{
>+ struct net_generic *ng;
>+ void *ptr;
>+
>+ rcu_read_lock();
>+ ng = rcu_dereference(net->gen);
>+ BUG_ON(id == 0 || id > ng->len);
>+ ptr = ng->ptr[id - 1];
>+ rcu_read_unlock();
>+
>+ BUG_ON(!ptr);
>+ return ptr;
>+}
>+EXPORT_SYMBOL(net_generic);
>+
> static struct net_generic *net_alloc_generic(void)
> {
> struct net_generic *ng;
>--
>1.8.1.4
>
On Tue, 2015-04-14 at 14:25 +0200, Denys Vlasenko wrote:
> On x86 allyesconfig build:
> The function compiles to 130 bytes of machine code.
> It has 493 callsites.
> Total reduction of vmlinux size: 27906 bytes.
>
> text data bss dec hex filename
> 82447071 22255384 20627456 125329911 77861f7 vmlinux4
> 82419165 22255384 20627456 125302005 777f4f5 vmlinux5
This sounds a big hammer to me.
These savings probably comes from the BUG_ON() that could simply be
removed.
The second one for sure has no purpose. First one looks defensive.
For a typical (non allyesconfig) kernel, net_generic() would translate
to :
return net->gen[id - 1]
Tunnels need this in fast path, so I presume we could introduce
net_generic_rcu() to keep this stuff inlined where it matters.
static inline void *net_generic_rcu(const struct net *net, int id)
{
struct net_generic *ng = rcu_dereference(net->gen);
return ng->ptr[id - 1];
}
On 04/14/2015 03:19 PM, Eric Dumazet wrote:
> On Tue, 2015-04-14 at 14:25 +0200, Denys Vlasenko wrote:
>> On x86 allyesconfig build:
>> The function compiles to 130 bytes of machine code.
>> It has 493 callsites.
>> Total reduction of vmlinux size: 27906 bytes.
>>
>> text data bss dec hex filename
>> 82447071 22255384 20627456 125329911 77861f7 vmlinux4
>> 82419165 22255384 20627456 125302005 777f4f5 vmlinux5
>
> This sounds a big hammer to me.
>
> These savings probably comes from the BUG_ON() that could simply be
> removed.
> The second one for sure has no purpose. First one looks defensive.
>
> For a typical (non allyesconfig) kernel, net_generic() would translate
> to :
>
> return net->gen[id - 1]
My allyesconfig, with BUG_ON's commented out:
void *net_generic(const struct net *net, int id)
{
struct net_generic *ng;
void *ptr;
rcu_read_lock();
ng = rcu_dereference(net->gen);
// BUG_ON(id == 0 || id > ng->len);
ptr = ng->ptr[id - 1];
rcu_read_unlock();
// BUG_ON(!ptr);
return ptr;
}
results in the following assembly:
net_generic:
call __fentry__
pushq %rbp #
movq %rsp, %rbp #,
pushq %r12 #
movl %esi, %r12d # id, id
pushq %rbx #
movq %rdi, %rbx # net, net
call rcu_read_lock #
movq 4784(%rbx), %rbx # MEM[(struct net_generic * const volatile *)net_2(D) + 4784B], _________p1
call debug_lockdep_rcu_enabled #
testl %eax, %eax # D.48937
je .L93 #,
cmpb $0, __warned.47396(%rip) #, __warned
jne .L93 #,
call rcu_read_lock_held #
testl %eax, %eax # D.48944
jne .L93 #,
movq $.LC6, %rdx #,
movl $51, %esi #,
movq $.LC0, %rdi #,
movb $1, __warned.47396(%rip) #, __warned
call lockdep_rcu_suspicious #
.L93:
decl %r12d # tmp68
movslq %r12d, %r12 # tmp68, tmp69
movq 24(%rbx,%r12,8), %rbx # _________p1_4->ptr, ptr
call rcu_read_unlock #
movq %rbx, %rax # ptr,
popq %rbx #
popq %r12 #
popq %rbp #
ret
This is still 112 bytes of code.
On Tue, 2015-04-14 at 15:57 +0200, Denys Vlasenko wrote:
> My allyesconfig, with BUG_ON's commented out:
>
Right. But I can tell you nobody uses lockdep on a production kernel.
Here, at Google, we get what I described.
On 04/14/2015 04:21 PM, Eric Dumazet wrote:
> On Tue, 2015-04-14 at 15:57 +0200, Denys Vlasenko wrote:
>
>> My allyesconfig, with BUG_ON's commented out:
>>
>
> Right. But I can tell you nobody uses lockdep on a production kernel.
>
> Here, at Google, we get what I described.
I'm trying to get to the .config which generates a small function.
So far, with LOCKDEP off, it is still this big:
net_generic:
call __fentry__
pushq %rbp #
#APP
# 72 "./arch/x86/include/asm/preempt.h" 1
incl %gs:__preempt_count(%rip) # __preempt_count
# 0 "" 2
#NO_APP
movq %rsp, %rbp #,
pushq %r12 #
movq %rdi, %r12 # net, net
pushq %rbx #
movl %esi, %ebx # id, id
call rcu_lock_acquire.constprop.15 #
movq 4784(%r12), %rax # MEM[(struct net_generic * const volatile *)net_2(D) + 4784B], _________p1
decl %ebx # tmp65
movslq %ebx, %rbx # tmp65, tmp66
movq 24(%rax,%rbx,8), %rbx # _________p1_4->ptr, ptr
call rcu_read_unlock #
movq %rbx, %rax # ptr,
popq %rbx #
popq %r12 #
popq %rbp #
ret
Config is attached.
On Tue, 2015-04-14 at 17:04 +0200, Denys Vlasenko wrote:
> On 04/14/2015 04:21 PM, Eric Dumazet wrote:
> > On Tue, 2015-04-14 at 15:57 +0200, Denys Vlasenko wrote:
> >
> >> My allyesconfig, with BUG_ON's commented out:
> >>
> >
> > Right. But I can tell you nobody uses lockdep on a production kernel.
> >
> > Here, at Google, we get what I described.
>
> I'm trying to get to the .config which generates a small function.
>
> So far, with LOCKDEP off, it is still this big:
>
If you read exactly what I wrote, you'll understand that at Google,
rcu_read_lock() & rcu_read_unlock() are nop.
Your patch is based on a worst case scenario, and will slow down our use
case.
From: Denys Vlasenko <[email protected]>
Date: Tue, 14 Apr 2015 14:25:11 +0200
> On x86 allyesconfig build:
> The function compiles to 130 bytes of machine code.
> It has 493 callsites.
> Total reduction of vmlinux size: 27906 bytes.
>
> text data bss dec hex filename
> 82447071 22255384 20627456 125329911 77861f7 vmlinux4
> 82419165 22255384 20627456 125302005 777f4f5 vmlinux5
>
> Signed-off-by: Denys Vlasenko <[email protected]>
That BUG_ON() was added 7 years ago, and I don't remember it ever
triggering or helping us diagnose something, so just remove it and
keep the function inlined.
Hi David, Eric,
As you may have surmised, this patch wasn't a result of me looking
at networking code; rather, it is a result of semi-automated search
for huge inlines.
The last step of this process would be the submission of patches.
I am expecting a range of responses from maintainers: enthusiastic,
"no reply", "go away, I don't care about code size",
and everything in between.
I can't invest a large amount of effort trying to push
every deinlining patch through. If someone, for example, would
flat out refuse to fix a huge inline in his driver, so be it.
I will be happy to run at least a few iterations over a patch
if maintainers do see a merit in deinlining, but have some
reservations wrt performance.
Your response falls into the latter case (you aren't refusing the patch
outright, but want it to be changed in some way).
It would help if you tell me how I should change the patches.
(I also hope to have a semi-generic way of addressing
performance concerns for future deinlining
patch submissions.)
On 04/14/2015 08:37 PM, David Miller wrote:
> From: Denys Vlasenko <[email protected]>
> Date: Tue, 14 Apr 2015 14:25:11 +0200
>
>> On x86 allyesconfig build:
>> The function compiles to 130 bytes of machine code.
>> It has 493 callsites.
>> Total reduction of vmlinux size: 27906 bytes.
>>
>> text data bss dec hex filename
>> 82447071 22255384 20627456 125329911 77861f7 vmlinux4
>> 82419165 22255384 20627456 125302005 777f4f5 vmlinux5
>>
>> Signed-off-by: Denys Vlasenko <[email protected]>
>
> That BUG_ON() was added 7 years ago, and I don't remember it ever
> triggering or helping us diagnose something, so just remove it and
> keep the function inlined.
There are two BUG_ONs. I'll remove both of them in v2.
Let me know if you want something else.
However, without BUG_ONs, function is still a bit big
on PREEMPT configs.
Among almost 500 users, many probably aren't hot paths.
How about having an inlined version, say, "fast_net_generic()",
and a deinlined one, and using one or another as appropriate.
Is this ok with you?
On 04/14/2015 03:19 PM, Eric Dumazet wrote:
> On Tue, 2015-04-14 at 14:25 +0200, Denys Vlasenko wrote:
>> On x86 allyesconfig build:
>> The function compiles to 130 bytes of machine code.
>> It has 493 callsites.
>> Total reduction of vmlinux size: 27906 bytes.
>>
>> text data bss dec hex filename
>> 82447071 22255384 20627456 125329911 77861f7 vmlinux4
>> 82419165 22255384 20627456 125302005 777f4f5 vmlinux5
>
> This sounds a big hammer to me.
>
> These savings probably comes from the BUG_ON() that could simply be
> removed.
> The second one for sure has no purpose. First one looks defensive.
>
> For a typical (non allyesconfig) kernel, net_generic() would translate
> to :
>
> return net->gen[id - 1]
>
> Tunnels need this in fast path, so I presume we could introduce
> net_generic_rcu() to keep this stuff inlined where it matters.
>
> static inline void *net_generic_rcu(const struct net *net, int id)
> {
> struct net_generic *ng = rcu_dereference(net->gen);
>
> return ng->ptr[id - 1];
> }
I looked at the code and I'm not feeling confident enough
to find places where replacing net_generic() with
net_generic_rcu() is okay.
Would it be okay if I add net_generic_rcu() as you suggest,
but leave it to you (network people) to switch to it where
appropriate?
--
vda
On Thu, 2015-04-16 at 13:14 +0200, Denys Vlasenko wrote:
> However, without BUG_ONs, function is still a bit big
> on PREEMPT configs.
Only on allyesconfig builds, that nobody use but to prove some points
about code size.
If you look at net_generic(), it is mostly used from code that is
normally in 3 modules. How many people really load them ?
net/tipc : 91 call sites
net/sunrpc : 57
fs/nfsd & fs/lockd : 183
Then few remaining uses in tunnels.
As we suggested, please just remove the BUG_ON().
Then, if you can come up with a solution that does not slow down non
PREEMPT kernel, why not.
From: Denys Vlasenko <[email protected]>
Date: Thu, 16 Apr 2015 13:14:14 +0200
> It would help if you tell me how I should change the patches.
Why ask Eric when I told you exactly how to change the patch to make
it acceptable, so please do so.
On 04/16/2015 02:38 PM, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 13:14 +0200, Denys Vlasenko wrote:
>
>> However, without BUG_ONs, function is still a bit big
>> on PREEMPT configs.
>
> Only on allyesconfig builds, that nobody use but to prove some points
> about code size.
How do you expect one to find excessively large inlines,
if not on allyesconfig build?
Only by using allyesconfig, I can measure how many calls
are there in the kernel. (grepping source is utterly unreliable
due to nested inlines and macros).
For the record: I am not using the _full_ allyesconfig,
I do disable some debugging options which clearly aren't
ever enabled on production systems. E.g. in my config:
# CONFIG_DEBUG_KMEMLEAK_TEST is not set
# CONFIG_KASAN is not set
etc.
> If you look at net_generic(), it is mostly used from code that is
> normally in 3 modules. How many people really load them ?
>
> net/tipc : 91 call sites
> net/sunrpc : 57
> fs/nfsd & fs/lockd : 183
>
> Then few remaining uses in tunnels.
Grepping is far from reliable. The above missed more than half
of all calls. I disassembed vmlinux after deinlining, there are
nearly 500 calls of net_generic().
> As we suggested, please just remove the BUG_ON().
Going to send the patch in a minute.
--
vda
On Fri, 2015-04-17 at 19:05 +0200, Denys Vlasenko wrote:
> How do you expect one to find excessively large inlines,
> if not on allyesconfig build?
Tuning kernel sources based on allyesconfig build _size_ only is
terrible. We could build an interpreter based kernel and maybe reduce
its size by 50% who knows...
You are saying that all inline should be removed, since it is obvious
kernel size _will_ be smaller.
That is an ... interesting idea, but hardly related to net_generic().
On 04/17/2015 07:42 PM, Eric Dumazet wrote:
> On Fri, 2015-04-17 at 19:05 +0200, Denys Vlasenko wrote:
>> How do you expect one to find excessively large inlines,
>> if not on allyesconfig build?
>
> Tuning kernel sources based on allyesconfig build _size_ only is
> terrible. We could build an interpreter based kernel and maybe reduce
> its size by 50% who knows...
>
> You are saying that all inline should be removed, since it is obvious
> kernel size _will_ be smaller.
I am not saying that. That would be stupid.
From: Eric Dumazet <[email protected]>
Date: Fri, 17 Apr 2015 10:42:16 -0700
> On Fri, 2015-04-17 at 19:05 +0200, Denys Vlasenko wrote:
>
>> How do you expect one to find excessively large inlines,
>> if not on allyesconfig build?
>
> Tuning kernel sources based on allyesconfig build _size_ only is
> terrible. We could build an interpreter based kernel and maybe reduce
> its size by 50% who knows...
>
> You are saying that all inline should be removed, since it is obvious
> kernel size _will_ be smaller.
+1
From: Denys Vlasenko <[email protected]>
Date: Fri, 17 Apr 2015 19:05:17 +0200
> On 04/16/2015 02:38 PM, Eric Dumazet wrote:
>> On Thu, 2015-04-16 at 13:14 +0200, Denys Vlasenko wrote:
>>
>>> However, without BUG_ONs, function is still a bit big
>>> on PREEMPT configs.
>>
>> Only on allyesconfig builds, that nobody use but to prove some points
>> about code size.
>
> How do you expect one to find excessively large inlines,
> if not on allyesconfig build?
>
> Only by using allyesconfig, I can measure how many calls
> are there in the kernel. (grepping source is utterly unreliable
> due to nested inlines and macros).
It is not indicative for it's overhead in what people actually make
use of, which is what is actually important.
Uninlining a static inline that basically does no more than index into
an array is nothing more than pure folly. So please don't try to
weasel your way out of accepting this basic fact.
That's exactly the situation where the implementation of an abstraction
via a static inline is exactly the thing to do.