2020-03-24 14:26:41

by Peter Zijlstra

[permalink] [raw]
Subject: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

Extend the static_call infrastructure to optimize the following common
pattern:

if (func_ptr)
func_ptr(args...)

For the trampoline (which is in effect a tail-call), we patch the
JMP.d32 into a RET, which then directly consumes the trampoline call.

For the in-line sites we replace the CALL with a NOP5.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/static_call.h | 10 ++++++++
arch/x86/kernel/static_call.c | 42 ++++++++++++++++++++++++++++---------
include/linux/static_call.h | 29 +++++++++++++++++++++++++
3 files changed, 71 insertions(+), 10 deletions(-)

--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -32,4 +32,14 @@
".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
".popsection \n")

+#define ARCH_DEFINE_STATIC_CALL_RETTRAMP(name) \
+ asm(".pushsection .static_call.text, \"ax\" \n" \
+ ".align 4 \n" \
+ ".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
+ STATIC_CALL_TRAMP_STR(name) ": \n" \
+ " ret; nop; nop; nop; nop; \n" \
+ ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
+ ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -4,19 +4,41 @@
#include <linux/bug.h>
#include <asm/text-patching.h>

-static void __static_call_transform(void *insn, u8 opcode, void *func)
+enum insn_type {
+ call = 0, /* site call */
+ nop = 1, /* site cond-call */
+ jmp = 2, /* tramp / site tail-call */
+ ret = 3, /* tramp / site cond-tail-call */
+};
+
+static void __static_call_transform(void *insn, enum insn_type type, void *func)
{
- const void *code = text_gen_insn(opcode, (long)insn, (long)func);
+ int size = CALL_INSN_SIZE;
+ const void *code;

- if (WARN_ONCE(*(u8 *)insn != opcode,
- "unexpected static call insn opcode 0x%x at %pS\n",
- opcode, insn))
- return;
+ switch (type) {
+ case call:
+ code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ break;
+
+ case nop:
+ code = ideal_nops[NOP_ATOMIC5];
+ break;
+
+ case jmp:
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, func);
+ break;
+
+ case ret:
+ code = text_gen_insn(RET_INSN_OPCODE, insn, func);
+ size = RET_INSN_SIZE;
+ break;
+ }

- if (memcmp(insn, code, CALL_INSN_SIZE) == 0)
+ if (memcmp(insn, code, size) == 0)
return;

- text_poke_bp(insn, code, CALL_INSN_SIZE, NULL);
+ text_poke_bp(insn, code, size, NULL);
}

void arch_static_call_transform(void *site, void *tramp, void *func)
@@ -24,10 +46,10 @@ void arch_static_call_transform(void *si
mutex_lock(&text_mutex);

if (tramp)
- __static_call_transform(tramp, JMP32_INSN_OPCODE, func);
+ __static_call_transform(tramp, jmp + !func, func);

if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site)
- __static_call_transform(site, CALL_INSN_OPCODE, func);
+ __static_call_transform(site, !func, func);

mutex_unlock(&text_mutex);
}
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -17,6 +17,7 @@
* DECLARE_STATIC_CALL(name, func);
* DEFINE_STATIC_CALL(name, func);
* static_call(name)(args...);
+ * static_cond_call(name)(args...)
* static_call_update(name, func);
*
* Usage example:
@@ -107,7 +108,17 @@ extern int static_call_text_reserved(voi
__ADDRESSABLE(STATIC_CALL_NAME(name)); \
ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)

+#define DEFINE_STATIC_COND_CALL(name, _func) \
+ DECLARE_STATIC_CALL(name, _func); \
+ struct static_call_key STATIC_CALL_NAME(name) = { \
+ .func = NULL, \
+ .type = 1, \
+ }; \
+ __ADDRESSABLE(STATIC_CALL_NAME(name)); \
+ ARCH_DEFINE_STATIC_CALL_RETTRAMP(name)
+
#define static_call(name) STATIC_CALL_TRAMP(name)
+#define static_cond_call(name) STATIC_CALL_TRAMP(name)

#define EXPORT_STATIC_CALL(name) \
EXPORT_SYMBOL(STATIC_CALL_NAME(name)); \
@@ -130,7 +141,15 @@ struct static_call_key {
}; \
ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)

+#define DEFINE_STATIC_COND_CALL(name, _func) \
+ DECLARE_STATIC_CALL(name, _func); \
+ struct static_call_key STATIC_CALL_NAME(name) = { \
+ .func = NULL, \
+ }; \
+ ARCH_DEFINE_STATIC_CALL_RETTRAMP(name)
+
#define static_call(name) STATIC_CALL_TRAMP(name)
+#define static_cond_call(name) STATIC_CALL_TRAMP(name)

static inline
void __static_call_update(struct static_call_key *key, void *tramp, void *func)
@@ -161,9 +180,19 @@ struct static_call_key {
.func = _func, \
}

+#define DEFINE_STATIC_COND_CALL(name, _func) \
+ DECLARE_STATIC_CALL(name, _func); \
+ struct static_call_key STATIC_CALL_NAME(name) = { \
+ .func = NULL, \
+ }
+
#define static_call(name) \
((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))

+#define static_cond_call(name) \
+ if (STATIC_CALL_NAME(name).func) \
+ ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
+
static inline
void __static_call_update(struct static_call_key *key, void *tramp, void *func)
{



2020-03-24 16:14:58

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Tue, Mar 24, 2020 at 7:25 AM Peter Zijlstra <[email protected]> wrote:
>
> Extend the static_call infrastructure to optimize the following common
> pattern:
>
> if (func_ptr)
> func_ptr(args...)

Is there any reason why this shouldn't be the default static call pattern?

IOW, do we need the special "cond" versions at all? Couldn't we just
say that this is how static calls fundamentally work - if the function
is NULL, they are nops?

Linus

2020-03-24 16:24:24

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()


> On Mar 24, 2020, at 9:14 AM, Linus Torvalds <[email protected]> wrote:
>
> On Tue, Mar 24, 2020 at 7:25 AM Peter Zijlstra <[email protected]> wrote:
>>
>> Extend the static_call infrastructure to optimize the following common
>> pattern:
>>
>> if (func_ptr)
>> func_ptr(args...)
>
> Is there any reason why this shouldn't be the default static call pattern?
>
> IOW, do we need the special "cond" versions at all? Couldn't we just
> say that this is how static calls fundamentally work - if the function
> is NULL, they are nops?
>
>

I haven’t checked if static calls currently support return values, but the conditional case only makes sense for functions that return void.

Aside from that, it might be nice for passing NULL in to warn or bug when the NULL pointer is stored instead of silently NOPping out the call in cases where having a real implementation isn’t optional.

2020-03-24 16:43:18

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Tue, Mar 24, 2020 at 9:22 AM Andy Lutomirski <[email protected]> wrote:
>
> I haven’t checked if static calls currently support return values, but
> the conditional case only makes sense for functions that return void.
>
> Aside from that, it might be nice for passing NULL in to warn or bug
> when the NULL pointer is stored instead of silently NOPping out the
> call in cases where having a real implementation isn’t optional.

Both good points. I take back my question.

And it aside from warning about passing in NULL then it doesn't work,
I wonder if we could warn - at build time - when then using the COND
version with a function that doesn't return void?

Of course, one alternative is to just say "instead of using NOP, use
'xorl %eax,%eax'", and then we'd have the rule that a NULL conditional
function returns zero (or NULL).

I _think_ a "xorl %eax,%eax ; retq" is just three bytes and would fit
in the tailcall slot too.

Linus

2020-03-24 16:55:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Tue, Mar 24, 2020 at 09:14:03AM -0700, Linus Torvalds wrote:
> On Tue, Mar 24, 2020 at 7:25 AM Peter Zijlstra <[email protected]> wrote:
> >
> > Extend the static_call infrastructure to optimize the following common
> > pattern:
> >
> > if (func_ptr)
> > func_ptr(args...)
>
> Is there any reason why this shouldn't be the default static call pattern?
>
> IOW, do we need the special "cond" versions at all? Couldn't we just
> say that this is how static calls fundamentally work - if the function
> is NULL, they are nops?

That doesn't work for functions that have a return value ...

2020-03-24 17:04:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Tue, Mar 24, 2020 at 09:33:21AM -0700, Linus Torvalds wrote:
> On Tue, Mar 24, 2020 at 9:22 AM Andy Lutomirski <[email protected]> wrote:
> >
> > I haven’t checked if static calls currently support return values, but
> > the conditional case only makes sense for functions that return void.
> >
> > Aside from that, it might be nice for passing NULL in to warn or bug
> > when the NULL pointer is stored instead of silently NOPping out the
> > call in cases where having a real implementation isn’t optional.
>
> Both good points. I take back my question.
>
> And it aside from warning about passing in NULL then it doesn't work,
> I wonder if we could warn - at build time - when then using the COND
> version with a function that doesn't return void?

I actually (abuse) do that in the last patch... the reason being that
DEFINE_STATIC_COND_CALL() ends up only needing a type expression for the
second argument, while DEFINE_STATIC_CALL() needs an actual function.

> Of course, one alternative is to just say "instead of using NOP, use
> 'xorl %eax,%eax'", and then we'd have the rule that a NULL conditional
> function returns zero (or NULL).
>
> I _think_ a "xorl %eax,%eax ; retq" is just three bytes and would fit
> in the tailcall slot too.

Correct. The only problem is that our text patching machinery can't
replace multiple instructions :/

2020-03-25 18:14:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Tue, Mar 24, 2020 at 06:03:06PM +0100, Peter Zijlstra wrote:
> On Tue, Mar 24, 2020 at 09:33:21AM -0700, Linus Torvalds wrote:

> > Of course, one alternative is to just say "instead of using NOP, use
> > 'xorl %eax,%eax'", and then we'd have the rule that a NULL conditional
> > function returns zero (or NULL).
> >
> > I _think_ a "xorl %eax,%eax ; retq" is just three bytes and would fit
> > in the tailcall slot too.
>
> Correct. The only problem is that our text patching machinery can't
> replace multiple instructions :/

To clarify; the problem is a task getting preempted with its RIP at the
RET. Then when we rewrite the text to be a CALL/JMP.d32 it will read
garbage (1 byte into the displacement of the instruction) instead of a
RET when it resumes.

Now, there are ways to fix this, the easiest being calling
synchronize_rcu_tasks() just like optprobes does (see also commit
5c02ece81848 ("x86/kprobes: Fix ordering while text-patching")).

It would mean patching a call away from NULL will be 'expensive' but it
ought to work.

I'll try and do the patch, see what it looks like.

2020-03-25 18:28:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Wed, Mar 25, 2020 at 11:13 AM Peter Zijlstra <[email protected]> wrote:
>
> To clarify; the problem is a task getting preempted with its RIP at the
> RET. Then when we rewrite the text to be a CALL/JMP.d32 it will read
> garbage (1 byte into the displacement of the instruction) instead of a
> RET when it resumes.

Yeah, I realized it when you mentioned it. I was trying to come up
with some clever way to avoid it, but can't see anything.

I can come up with insane models - you could replace the xor;ret
sequence with a jump to a trampoline where you've aligned the target
trampoline so that the third byte in the "jmp xxx" remains a 'ret'
instruction. Then replace _that_ one with a "call_rcu()" callback.
Wild handwaving of "I'm sure this could be made to work".

But nothing remotely sane comes to mind.

Linus

2020-03-25 19:37:49

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On March 24, 2020 9:33:21 AM PDT, Linus Torvalds <[email protected]> wrote:
>On Tue, Mar 24, 2020 at 9:22 AM Andy Lutomirski <[email protected]>
>wrote:
>>
>> I haven’t checked if static calls currently support return values,
>but
>> the conditional case only makes sense for functions that return void.
>>
>> Aside from that, it might be nice for passing NULL in to warn or bug
>> when the NULL pointer is stored instead of silently NOPping out the
>> call in cases where having a real implementation isn’t optional.
>
>Both good points. I take back my question.
>
>And it aside from warning about passing in NULL then it doesn't work,
>I wonder if we could warn - at build time - when then using the COND
>version with a function that doesn't return void?
>
>Of course, one alternative is to just say "instead of using NOP, use
>'xorl %eax,%eax'", and then we'd have the rule that a NULL conditional
>function returns zero (or NULL).
>
>I _think_ a "xorl %eax,%eax ; retq" is just three bytes and would fit
>in the tailcall slot too.
>
> Linus

"movl $0,%eax" is five bytes, the same length as a call. Doesn't work for a tailcall, still, although if the sequence:

jmp tailcall
retq

... can be generated at the tailcall site then the jmp can get patched out.

This would be equivalent to disabling tailcalls except that the stack frame is normally not unwound until between the call and the ret, so just disabling tailcalls from the compiler pov doesn't work.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2020-03-25 20:54:05

by Linus Torvalds

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Wed, Mar 25, 2020 at 12:35 PM <[email protected]> wrote:
>
> "movl $0,%eax" is five bytes, the same length as a call. Doesn't work for a tailcall, still, although if the sequence:
>
> jmp tailcall
> retq
>
> ... can be generated at the tailcall site then the jmp can get patched out.

No, the problem is literally that the whole approach depends on the
compiler just generating normal code for the static calls.

And the tailcall is the only interesting case. The normal call thing
can be trivially just a single instruction (a mov like you say, but
also easily just a xor padded with prefixes).

Linus

2020-03-25 22:07:50

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Wed, Mar 25, 2020 at 01:52:00PM -0700, Linus Torvalds wrote:
> On Wed, Mar 25, 2020 at 12:35 PM <[email protected]> wrote:
> >
> > "movl $0,%eax" is five bytes, the same length as a call. Doesn't work for a tailcall, still, although if the sequence:
> >
> > jmp tailcall
> > retq
> >
> > ... can be generated at the tailcall site then the jmp can get patched out.
>
> No, the problem is literally that the whole approach depends on the
> compiler just generating normal code for the static calls.
>
> And the tailcall is the only interesting case. The normal call thing
> can be trivially just a single instruction (a mov like you say, but
> also easily just a xor padded with prefixes).

So I got the text poking bit written, and that turned out to be the
simple part :/ Find below.

Then we can do:

#define static_void_call(name)
if (STATIC_CALL_NAME(name).func) \
((typeof(STATIC_CALL_TRAMP(name))*)STATIC_CALL_NAME(name).func)

Which works, as evidenced by if being the current static_cond_call(),
but it is non-optimal code-gen for the case where func will never be
NULL, and also there is no way to write a !void version of the same.

The best I can come up with is something like:

#define static_call(name, args...) ({ \
typeof(STATIC_CALL_TRAMP(name)(args)) __ret = (typeof(STATIC_CALL_TRAMP(name)(args)))0; \
if (STATIC_CALL_NAME(name).func) \
__ret = ((typeof(STATIC_CALL_TRAMP(name))*)STATIC_CALL_NAME(name).func)(args); \
__ret; })

Which has a different (and IMO less natural) syntax.

That then brings us to the HAVE_STATIC_CALL variant; there we need to
somehow make the void vs !void thing persistent, and there I ran out of
ideas.

Initially I figured we could do something like:

#define annotate_void_call() ({ \
asm volatile("%c0:\n\t" \
".pushsection .discard.void_call\n\t" \
".long %c0b - .\n\t" \
".popsection\n\t" : : "i" (__COUNTER__)); \
})

#define static_void_call(name) \
annotate_void_call(); \
STATIC_CALL_TRAMP(name)

But that doesn't actually work for something like:

static_void_call(foo)(static_call(bar)());

Where the argument setup of the call, include another static call.
Arguably this is quite insane, and we could just say:
"don't-do-that-then", but it does show how fragile this is.

Anyway, let me ponder this a little more... brain is starting to give
out anyway. More tomorrow.


---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 346c98d5261e..240996338f66 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -947,6 +947,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 multi;
};

struct bp_patching_desc {
@@ -1103,8 +1104,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
.refs = ATOMIC_INIT(1),
};
unsigned char int3 = INT3_INSN_OPCODE;
+ int do_sync, do_multi = 0;
unsigned int i;
- int do_sync;

lockdep_assert_held(&text_mutex);

@@ -1119,11 +1120,24 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ do_multi |= tp[i].multi;
+ }

text_poke_sync();

+ if (do_multi) {
+ /*
+ * In case the 'old' text consisted of multiple instructions
+ * we need to wait for an rcu_tasks quiescence period to ensure
+ * all potentially preempted tasks have normally scheduled.
+ * This ensures no tasks still have their instruction pointer
+ * pointed at what will become the middle of an instruction.
+ */
+ synchronize_rcu_tasks();
+ }
+
/*
* Second step: update all but the first byte of the patched range.
*/
@@ -1176,10 +1190,28 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
{
struct insn insn;

+ /*
+ * Determine if the 'old' text at @addr consists of multiple
+ * instructions. Make an exception for INT3 and RET, since
+ * they don't (necessarily) continue to execute the following
+ * instructions.
+ */
+ kernel_insn_init(&insn, addr, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+ tp.multi = (insn.legnth < len) &&
+ (insn.opcode.bytes[0] != RET_INSN_OPCODE ||
+ insn.opcode.bytes[0] != INT3_INSN_OPCODE);
+
+ /*
+ * Copy the 'new' text into the text_poke vector.
+ */
memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;

+ /*
+ * Decode the instruction poke_int3_handler() needs to emulate.
+ */
kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
insn_get_length(&insn);

diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 1e82e2486e76..2055e2d3674d 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,8 +9,13 @@ enum insn_type {
nop = 1, /* site cond-call */
jmp = 2, /* tramp / site tail-call */
ret = 3, /* tramp / site cond-tail-call */
+ null = 4,
+ null_ret = 5,
};

+static const u8 null_insn[5] = { 0xb8, 0x00, 0x00, 0x00, 0x00 }; /* movl $0, %eax */
+static const u8 null_ret_insn[5] = { 0x31, 0xc0, 0xc3, 0x90, 0x90 }; /* xorl %eax, %eax; ret; nop; nop; */
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
int size = CALL_INSN_SIZE;
@@ -34,6 +39,14 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
size = RET_INSN_SIZE;
break;

+ case null:
+ code = null_insi;
+ break;
+
+ case null_ret:
+ code = null_ret_insn;
+ break;
+
default: /* GCC is a moron -- it figures @code can be uninitialized below */
BUG();
}

2020-03-26 23:39:38

by Rasmus Villemoes

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On 24/03/2020 14.56, Peter Zijlstra wrote:
> Extend the static_call infrastructure to optimize the following common
> pattern:
>
> if (func_ptr)
> func_ptr(args...)
>

> +#define DEFINE_STATIC_COND_CALL(name, _func) \
> + DECLARE_STATIC_CALL(name, _func); \
> + struct static_call_key STATIC_CALL_NAME(name) = { \
> + .func = NULL, \
> + }
> +
> #define static_call(name) \
> ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
>
> +#define static_cond_call(name) \
> + if (STATIC_CALL_NAME(name).func) \
> + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
> +

What, apart from fear of being ridiculed by kernel folks, prevents the
compiler from reloading STATIC_CALL_NAME(name).func ? IOW, doesn't this
want a READ_ONCE somewhere?

And please remind me, what is the consensus for sizeof(long) loads: does
static_call() need load-tearing protection or not?

Rasmus

2020-03-27 10:09:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On Fri, Mar 27, 2020 at 12:37:35AM +0100, Rasmus Villemoes wrote:
> On 24/03/2020 14.56, Peter Zijlstra wrote:
> > Extend the static_call infrastructure to optimize the following common
> > pattern:
> >
> > if (func_ptr)
> > func_ptr(args...)
> >
>
> > +#define DEFINE_STATIC_COND_CALL(name, _func) \
> > + DECLARE_STATIC_CALL(name, _func); \
> > + struct static_call_key STATIC_CALL_NAME(name) = { \
> > + .func = NULL, \
> > + }
> > +
> > #define static_call(name) \
> > ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
> >
> > +#define static_cond_call(name) \
> > + if (STATIC_CALL_NAME(name).func) \
> > + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
> > +
>
> What, apart from fear of being ridiculed by kernel folks, prevents the
> compiler from reloading STATIC_CALL_NAME(name).func ? IOW, doesn't this
> want a READ_ONCE somewhere?

Hurmph.. I suspect you're quite right, but at the same time I can't seem
to write a macro that does that :/ Let me try harder.

> And please remind me, what is the consensus for sizeof(long) loads: does
> static_call() need load-tearing protection or not?

We all like to believe compilers are broken when they tear naturally
aligned words, but we're also not quite comfortable trusting that.

2020-03-27 13:28:10

by Rasmus Villemoes

[permalink] [raw]
Subject: Re: [RESEND][PATCH v3 14/17] static_call: Add static_cond_call()

On 27/03/2020 11.08, Peter Zijlstra wrote:
> On Fri, Mar 27, 2020 at 12:37:35AM +0100, Rasmus Villemoes wrote:
>> On 24/03/2020 14.56, Peter Zijlstra wrote:
>>> Extend the static_call infrastructure to optimize the following common
>>> pattern:
>>>
>>> if (func_ptr)
>>> func_ptr(args...)
>>>
>>
>>> +#define DEFINE_STATIC_COND_CALL(name, _func) \
>>> + DECLARE_STATIC_CALL(name, _func); \
>>> + struct static_call_key STATIC_CALL_NAME(name) = { \
>>> + .func = NULL, \
>>> + }
>>> +
>>> #define static_call(name) \
>>> ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
>>>
>>> +#define static_cond_call(name) \
>>> + if (STATIC_CALL_NAME(name).func) \
>>> + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_NAME(name).func))
>>> +
>>
>> What, apart from fear of being ridiculed by kernel folks, prevents the
>> compiler from reloading STATIC_CALL_NAME(name).func ? IOW, doesn't this
>> want a READ_ONCE somewhere?
>
> Hurmph.. I suspect you're quite right, but at the same time I can't seem
> to write a macro that does that :/ Let me try harder.

Hm, yeah, essentially one wants some macro magic that turns

foo(a)(b, c, d)

into

bar(a, b, c, d)

and then bar() can do the right thing.

One option is to give up on the nice syntax and just make it

static_cond_call(func, ...)

But, here's another few things that makes me wonder if the cond_call
variant is worth it, at least in its current form: In the case where
!ARCH_HAVE_STATIC_CALL, so static_cond_call(foo)(a, b, c) is just syntax
sugar for

if (foo)
foo(a, b, c)

gcc can choose to wait with computing the argument expressions a, b, c
until after the test - they may be somewhat expensive, but at the very
least there's some register shuffling to do to prepare for the call, and
probably also some restoring afterwards. In the ARCH_HAVE_STATIC_CALL
case, whether inline or not, it becomes an unconditional call from gcc's
perspective, so all the arguments must be computed and stuffed in the
right registers. That price may be higher than the load+test. Not to
mention the fact that side-effects in the arguments happen
unconditionally for ARCH_HAVE_STATIC_CALL but only if func is non-null
for !ARCH_HAVE_STATIC_CALL.

Perhaps associating a static_key with each STATIC_COND_CALL could solve
these. But that of course makes the update procedure somewhat more
complicated.

Rasmus