2019-06-28 01:51:55

by Josh Poimboeuf

Subject: [PATCH v4 0/2] x86: bpf unwinder fixes

Here's the last fix along with its objtool dependency.

For testing I verified that a WARN() in ___bpf_prog_run() now gives a
clean stack trace.

v4:
- Redesigned the jump table detection mechanism. Instead of requiring
the jump table to have a magic name, place it in a special section.

- The other two fixes from v3 have been merged into -tip.

Josh Poimboeuf (2):
objtool: Add support for C jump tables
bpf: Fix ORC unwinding in non-JIT BPF code

include/linux/compiler.h | 5 +++++
kernel/bpf/core.c | 3 +--
tools/objtool/check.c | 27 ++++++++++++++++++++-------
3 files changed, 26 insertions(+), 9 deletions(-)

--
2.20.1


2019-06-28 01:51:59

by Josh Poimboeuf

Subject: [PATCH v4 1/2] objtool: Add support for C jump tables

Objtool doesn't know how to read C jump tables, so it has to whitelist
functions which use them, causing missing ORC unwinder data for such
functions, e.g. ___bpf_prog_run().

C jump tables are very similar to GCC switch jump tables, which objtool
already knows how to read. So adding support for C jump tables is easy.
It just needs to be able to find the tables and distinguish them from
other data.

To allow the jump tables to be found, create an __annotate_jump_table
macro which can be used to annotate them.

The annotation is done by placing the jump table in an
.rodata..c_jump_table section. The '.rodata' prefix ensures that the
data will be placed in the rodata section by the vmlinux linker script.
The double periods are part of an existing convention which
distinguishes kernel sections from GCC sections.

Signed-off-by: Josh Poimboeuf <[email protected]>
---
include/linux/compiler.h | 5 +++++
tools/objtool/check.c | 27 ++++++++++++++++++++-------
2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8aaf7cd026b0..f0fd5636fddb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
".pushsection .discard.unreachable\n\t" \
".long 999b - .\n\t" \
".popsection\n\t"
+
+/* Annotate a C jump table to allow objtool to follow the code flow */
+#define __annotate_jump_table __section(".rodata..c_jump_table")
+
#else
#define annotate_reachable()
#define annotate_unreachable()
+#define __annotate_jump_table
#endif

#ifndef ASM_UNREACHABLE
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 172f99195726..27818a93f0b1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -18,6 +18,8 @@

#define FAKE_JUMP_OFFSET -1

+#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
+
struct alternative {
struct list_head list;
struct instruction *insn;
@@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file,

/*
* Make sure the .rodata address isn't associated with a
- * symbol. gcc jump tables are anonymous data.
+ * symbol. GCC jump tables are anonymous data.
+ *
+ * Also support C jump tables which are in the same format as
+ * switch jump tables. For objtool to recognize them, they
+ * need to be placed in the C_JUMP_TABLE_SECTION section. They
+ * have symbols associated with them.
*/
- if (find_symbol_containing(rodata_sec, table_offset))
+ if (find_symbol_containing(rodata_sec, table_offset) &&
+ strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION))
continue;

rodata_rela = find_rela_by_dest(rodata_sec, table_offset);
@@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file)
bool found = false;

/*
- * This searches for the .rodata section or multiple .rodata.func_name
- * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8
- * rodata sections are ignored as they don't contain jump tables.
+ * Search for the following rodata sections, each of which can
+ * potentially contain jump tables:
+ *
+ * - .rodata: can contain GCC switch tables
+ * - .rodata.<func>: same, if -fdata-sections is being used
+ * - .rodata..c_jump_table: contains C annotated jump tables
+ *
+ * .rodata.str1.* sections are ignored; they don't contain jump tables.
*/
for_each_sec(file, sec) {
- if (!strncmp(sec->name, ".rodata", 7) &&
- !strstr(sec->name, ".str1.")) {
+ if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) ||
+ !strcmp(sec->name, C_JUMP_TABLE_SECTION)) {
sec->rodata = true;
found = true;
}
--
2.20.1

2019-06-28 01:52:31

by Josh Poimboeuf

Subject: [PATCH v4 2/2] bpf: Fix ORC unwinding in non-JIT BPF code

Objtool previously ignored ___bpf_prog_run() because it didn't
understand the jump table. This resulted in the ORC unwinder not being
able to unwind through non-JIT BPF code.

Now that objtool knows how to read jump tables, remove the whitelist and
annotate the jump table so objtool can recognize it.

Also add an additional "const" to the jump table definition to clarify
that the text pointers are constant. Otherwise GCC sets the section
writable flag and the assembler spits out warnings.
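
For illustration (generic C, not taken from the patch), the difference
between the two declarations:

	/* One const: an array of pointers-to-const; the array itself is
	 * writable, so GCC emits it as writable data. */
	static const void *tbl_a[2];

	/* Two consts: the pointers are also const, so the whole array is
	 * read-only and can live in .rodata. */
	static const void * const tbl_b[2];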

Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
Reported-by: Song Liu <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
---
kernel/bpf/core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c473f208a10..45456a796d7f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1299,7 +1299,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
- static const void *jumptable[256] = {
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
@@ -1558,7 +1558,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
BUG_ON(1);
return 0;
}
-STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */

#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
--
2.20.1

2019-06-28 15:38:15

by Alexei Starovoitov

Subject: Re: [PATCH v4 2/2] bpf: Fix ORC unwinding in non-JIT BPF code

On Thu, Jun 27, 2019 at 6:51 PM Josh Poimboeuf <[email protected]> wrote:
>
> Objtool previously ignored ___bpf_prog_run() because it didn't
> understand the jump table. This resulted in the ORC unwinder not being
> able to unwind through non-JIT BPF code.
>
> Now that objtool knows how to read jump tables, remove the whitelist and
> annotate the jump table so objtool can recognize it.
>
> Also add an additional "const" to the jump table definition to clarify
> that the text pointers are constant. Otherwise GCC sets the section
> writable flag and the assembler spits out warnings.
>
> Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
> Reported-by: Song Liu <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>

I'm traveling atm, but the set looks good.
Acked-by: Alexei Starovoitov <[email protected]>

If the tip maintainers can route it to Linus quickly, then
please send the whole thing via the tip tree.
Or we can send these two patches via the bpf tree early next week.

Subject: [tip:x86/urgent] objtool: Add support for C jump tables

Commit-ID: d31acc2cc6ee313cee663ffed89c6a8f807bd87b
Gitweb: https://git.kernel.org/tip/d31acc2cc6ee313cee663ffed89c6a8f807bd87b
Author: Josh Poimboeuf <[email protected]>
AuthorDate: Thu, 27 Jun 2019 20:50:46 -0500
Committer: Thomas Gleixner <[email protected]>
CommitDate: Sat, 29 Jun 2019 07:55:13 +0200

objtool: Add support for C jump tables

Objtool doesn't know how to read C jump tables, so it has to whitelist
functions which use them, causing missing ORC unwinder data for such
functions, e.g. ___bpf_prog_run().

C jump tables are very similar to GCC switch jump tables, which objtool
already knows how to read. So adding support for C jump tables is easy.
It just needs to be able to find the tables and distinguish them from
other data.

To allow the jump tables to be found, create an __annotate_jump_table
macro which can be used to annotate them.

The annotation is done by placing the jump table in an
.rodata..c_jump_table section. The '.rodata' prefix ensures that the data
will be placed in the rodata section by the vmlinux linker script. The
double periods are part of an existing convention which distinguishes
kernel sections from GCC sections.

Signed-off-by: Josh Poimboeuf <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Song Liu <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Daniel Borkmann <[email protected]>
Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com

---
include/linux/compiler.h | 5 +++++
tools/objtool/check.c | 27 ++++++++++++++++++++-------
2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8aaf7cd026b0..f0fd5636fddb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
".pushsection .discard.unreachable\n\t" \
".long 999b - .\n\t" \
".popsection\n\t"
+
+/* Annotate a C jump table to allow objtool to follow the code flow */
+#define __annotate_jump_table __section(".rodata..c_jump_table")
+
#else
#define annotate_reachable()
#define annotate_unreachable()
+#define __annotate_jump_table
#endif

#ifndef ASM_UNREACHABLE
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 172f99195726..27818a93f0b1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -18,6 +18,8 @@

#define FAKE_JUMP_OFFSET -1

+#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
+
struct alternative {
struct list_head list;
struct instruction *insn;
@@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file,

/*
* Make sure the .rodata address isn't associated with a
- * symbol. gcc jump tables are anonymous data.
+ * symbol. GCC jump tables are anonymous data.
+ *
+ * Also support C jump tables which are in the same format as
+ * switch jump tables. For objtool to recognize them, they
+ * need to be placed in the C_JUMP_TABLE_SECTION section. They
+ * have symbols associated with them.
*/
- if (find_symbol_containing(rodata_sec, table_offset))
+ if (find_symbol_containing(rodata_sec, table_offset) &&
+ strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION))
continue;

rodata_rela = find_rela_by_dest(rodata_sec, table_offset);
@@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file)
bool found = false;

/*
- * This searches for the .rodata section or multiple .rodata.func_name
- * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8
- * rodata sections are ignored as they don't contain jump tables.
+ * Search for the following rodata sections, each of which can
+ * potentially contain jump tables:
+ *
+ * - .rodata: can contain GCC switch tables
+ * - .rodata.<func>: same, if -fdata-sections is being used
+ * - .rodata..c_jump_table: contains C annotated jump tables
+ *
+ * .rodata.str1.* sections are ignored; they don't contain jump tables.
*/
for_each_sec(file, sec) {
- if (!strncmp(sec->name, ".rodata", 7) &&
- !strstr(sec->name, ".str1.")) {
+ if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) ||
+ !strcmp(sec->name, C_JUMP_TABLE_SECTION)) {
sec->rodata = true;
found = true;
}

Subject: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

Commit-ID: b22cf36c189f31883ad0238a69ccf82aa1f3b16b
Gitweb: https://git.kernel.org/tip/b22cf36c189f31883ad0238a69ccf82aa1f3b16b
Author: Josh Poimboeuf <[email protected]>
AuthorDate: Thu, 27 Jun 2019 20:50:47 -0500
Committer: Thomas Gleixner <[email protected]>
CommitDate: Sat, 29 Jun 2019 07:55:14 +0200

bpf: Fix ORC unwinding in non-JIT BPF code

Objtool previously ignored ___bpf_prog_run() because it didn't understand
the jump table. This resulted in the ORC unwinder not being able to unwind
through non-JIT BPF code.

Now that objtool knows how to read jump tables, remove the whitelist and
annotate the jump table so objtool can recognize it.

Also add an additional "const" to the jump table definition to clarify that
the text pointers are constant. Otherwise GCC sets the section writable
flag and the assembler spits out warnings.

Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
Reported-by: Song Liu <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Daniel Borkmann <[email protected]>
Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com

---
kernel/bpf/core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7c473f208a10..45456a796d7f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1299,7 +1299,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
- static const void *jumptable[256] = {
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
@@ -1558,7 +1558,6 @@ out:
BUG_ON(1);
return 0;
}
-STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */

#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \

2019-07-06 20:30:46

by Ingo Molnar

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code


* tip-bot for Josh Poimboeuf <[email protected]> wrote:

> Commit-ID: b22cf36c189f31883ad0238a69ccf82aa1f3b16b
> Gitweb: https://git.kernel.org/tip/b22cf36c189f31883ad0238a69ccf82aa1f3b16b
> Author: Josh Poimboeuf <[email protected]>
> AuthorDate: Thu, 27 Jun 2019 20:50:47 -0500
> Committer: Thomas Gleixner <[email protected]>
> CommitDate: Sat, 29 Jun 2019 07:55:14 +0200
>
> bpf: Fix ORC unwinding in non-JIT BPF code
>
> Objtool previously ignored ___bpf_prog_run() because it didn't understand
> the jump table. This resulted in the ORC unwinder not being able to unwind
> through non-JIT BPF code.
>
> Now that objtool knows how to read jump tables, remove the whitelist and
> annotate the jump table so objtool can recognize it.
>
> Also add an additional "const" to the jump table definition to clarify that
> the text pointers are constant. Otherwise GCC sets the section writable
> flag and the assembler spits out warnings.
>
> Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
> Reported-by: Song Liu <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Acked-by: Alexei Starovoitov <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Kairui Song <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Daniel Borkmann <[email protected]>
> Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com
>
> ---
> kernel/bpf/core.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)

Hm, I get this new build warning on x86-64 defconfig-ish kernels plus
these enabled:

CONFIG_BPF=y
CONFIG_BPF_JIT=y

kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8da: sibling call from callable instruction with modified stack frame

Thanks,

Ingo

2019-07-07 01:35:25

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Sat, Jul 06, 2019 at 10:29:42PM +0200, Ingo Molnar wrote:
>
> * tip-bot for Josh Poimboeuf <[email protected]> wrote:
>
> > Commit-ID: b22cf36c189f31883ad0238a69ccf82aa1f3b16b
> > Gitweb: https://git.kernel.org/tip/b22cf36c189f31883ad0238a69ccf82aa1f3b16b
> > Author: Josh Poimboeuf <[email protected]>
> > AuthorDate: Thu, 27 Jun 2019 20:50:47 -0500
> > Committer: Thomas Gleixner <[email protected]>
> > CommitDate: Sat, 29 Jun 2019 07:55:14 +0200
> >
> > bpf: Fix ORC unwinding in non-JIT BPF code
> >
> > Objtool previously ignored ___bpf_prog_run() because it didn't understand
> > the jump table. This resulted in the ORC unwinder not being able to unwind
> > through non-JIT BPF code.
> >
> > Now that objtool knows how to read jump tables, remove the whitelist and
> > annotate the jump table so objtool can recognize it.
> >
> > Also add an additional "const" to the jump table definition to clarify that
> > the text pointers are constant. Otherwise GCC sets the section writable
> > flag and the assembler spits out warnings.
> >
> > Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
> > Reported-by: Song Liu <[email protected]>
> > Signed-off-by: Josh Poimboeuf <[email protected]>
> > Signed-off-by: Thomas Gleixner <[email protected]>
> > Acked-by: Alexei Starovoitov <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Kairui Song <[email protected]>
> > Cc: Steven Rostedt <[email protected]>
> > Cc: Borislav Petkov <[email protected]>
> > Cc: Daniel Borkmann <[email protected]>
> > Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com
> >
> > ---
> > kernel/bpf/core.c | 3 +--
> > 1 file changed, 1 insertion(+), 2 deletions(-)
>
> Hm, I get this new build warning on x86-64 defconfig-ish kernels plus
> these enabled:
>
> CONFIG_BPF=y
> CONFIG_BPF_JIT=y
>
> kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8da: sibling call from callable instruction with modified stack frame

I assume you have CONFIG_RETPOLINE disabled? For some reason that
causes GCC to add 166 indirect jumps to that function, which is giving
objtool trouble. Looking into it.

--
Josh

2019-07-07 05:55:33

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Sat, Jul 06, 2019 at 08:32:06PM -0500, Josh Poimboeuf wrote:
> On Sat, Jul 06, 2019 at 10:29:42PM +0200, Ingo Molnar wrote:
> > Hm, I get this new build warning on x86-64 defconfig-ish kernels plus
> > these enabled:
> >
> > CONFIG_BPF=y
> > CONFIG_BPF_JIT=y
> >
> > kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8da: sibling call from callable instruction with modified stack frame
>
> I assume you have CONFIG_RETPOLINE disabled? For some reason that
> causes GCC to add 166 indirect jumps to that function, which is giving
> objtool trouble. Looking into it.

Alexei, do you have any objections to setting -fno-gcse for
___bpf_prog_run()? Either for the function or the file? Doing so seems
to be recommended by the GCC manual for computed gotos. It would also
"fix" one of the issues. More details below.

Details:

With CONFIG_RETPOLINE=n, there are a couple of GCC optimizations in
___bpf_prog_run() which objtool is having trouble with.

1)

The function has:

select_insn:
goto *jumptable[insn->code];

And then a bunch of "goto select_insn" statements.

GCC is basically replacing

select_insn:
jmp *jumptable(,%rax,8)
...
ALU64_ADD_X:
...
jmp select_insn
ALU_ADD_X:
...
jmp select_insn

with

select_insn:
jmp *jumptable(, %rax, 8)
...
ALU64_ADD_X:
...
jmp *jumptable(, %rax, 8)
ALU_ADD_X:
...
jmp *jumptable(, %rax, 8)


It does that 166 times.

For some reason, it doesn't do the optimization with retpolines
enabled.

Objtool has never seen multiple indirect jump sites which use the same
jump table. This is relatively trivial to fix (I already have a
working patch).

2)

After doing the first optimization, GCC then does another one which is
a little trickier. It replaces:

select_insn:
jmp *jumptable(, %rax, 8)
...
ALU64_ADD_X:
...
jmp *jumptable(, %rax, 8)
ALU_ADD_X:
...
jmp *jumptable(, %rax, 8)

with

select_insn:
mov jumptable, %r12
jmp *(%r12, %rax, 8)
...
ALU64_ADD_X:
...
jmp *(%r12, %rax, 8)
ALU_ADD_X:
...
jmp *(%r12, %rax, 8)

The problem is that it only moves the jumptable address into %r12
once, for the entire function, then it goes through multiple recursive
indirect jumps which rely on that %r12 value. But objtool isn't yet
smart enough to be able to track the value across multiple recursive
indirect jumps through the jump table.

After some digging I found that the quick and easy fix is to disable
-fgcse. In fact, this seems to be recommended by the GCC manual, for
code like this:

-fgcse
Perform a global common subexpression elimination pass. This
pass also performs global constant and copy propagation.

Note: When compiling a program using computed gotos, a GCC
extension, you may get better run-time performance if you
disable the global common subexpression elimination pass by
adding -fno-gcse to the command line.

Enabled at levels -O2, -O3, -Os.

This code indeed relies extensively on computed gotos. I don't know
*why* disabling this optimization would improve performance. In fact
I really don't see how it could make much of a difference either way.

Anyway, using -fno-gcse makes optimization #2 go away and makes
objtool happy, with only a fix for #1 needed.

If -fno-gcse isn't an option, we might be able to fix objtool by using
the "first_jump_src" thing which Peter added, improving it such that
it also takes table jumps into account.

--
Josh

2019-07-08 22:53:47

by Alexei Starovoitov

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Sat, Jul 6, 2019 at 10:52 PM Josh Poimboeuf <[email protected]> wrote:
>
> On Sat, Jul 06, 2019 at 08:32:06PM -0500, Josh Poimboeuf wrote:
> > On Sat, Jul 06, 2019 at 10:29:42PM +0200, Ingo Molnar wrote:
> > > Hm, I get this new build warning on x86-64 defconfig-ish kernels plus
> > > these enabled:
> > >
> > > CONFIG_BPF=y
> > > CONFIG_BPF_JIT=y
> > >
> > > kernel/bpf/core.o: warning: objtool: ___bpf_prog_run()+0x8da: sibling call from callable instruction with modified stack frame
> >
> > I assume you have CONFIG_RETPOLINE disabled? For some reason that
> > causes GCC to add 166 indirect jumps to that function, which is giving
> > objtool trouble. Looking into it.
>
> Alexei, do you have any objections to setting -fno-gcse for
> ___bpf_prog_run()? Either for the function or the file? Doing so seems
> to be recommended by the GCC manual for computed gotos. It would also
> "fix" one of the issues. More details below.
>
> Details:
>
> With CONFIG_RETPOLINE=n, there are a couple of GCC optimizations in
> ___bpf_prog_run() which objtool is having trouble with.
>
> 1)
>
> The function has:
>
> select_insn:
> goto *jumptable[insn->code];
>
> And then a bunch of "goto select_insn" statements.
>
> GCC is basically replacing
>
> select_insn:
> jmp *jumptable(,%rax,8)
> ...
> ALU64_ADD_X:
> ...
> jmp select_insn
> ALU_ADD_X:
> ...
> jmp select_insn
>
> with
>
> select_insn:
> jmp *jumptable(, %rax, 8)
> ...
> ALU64_ADD_X:
> ...
> jmp *jumptable(, %rax, 8)
> ALU_ADD_X:
> ...
> jmp *jumptable(, %rax, 8)
>
>
> It does that 166 times.
>
> For some reason, it doesn't do the optimization with retpolines
> enabled.
>
> Objtool has never seen multiple indirect jump sites which use the same
> jump table. This is relatively trivial to fix (I already have a
> working patch).
>
> 2)
>
> After doing the first optimization, GCC then does another one which is
> a little trickier. It replaces:
>
> select_insn:
> jmp *jumptable(, %rax, 8)
> ...
> ALU64_ADD_X:
> ...
> jmp *jumptable(, %rax, 8)
> ALU_ADD_X:
> ...
> jmp *jumptable(, %rax, 8)
>
> with
>
> select_insn:
> mov jumptable, %r12
> jmp *(%r12, %rax, 8)
> ...
> ALU64_ADD_X:
> ...
> jmp *(%r12, %rax, 8)
> ALU_ADD_X:
> ...
> jmp *(%r12, %rax, 8)
>
> The problem is that it only moves the jumptable address into %r12
> once, for the entire function, then it goes through multiple recursive
> indirect jumps which rely on that %r12 value. But objtool isn't yet
> smart enough to be able to track the value across multiple recursive
> indirect jumps through the jump table.
>
> After some digging I found that the quick and easy fix is to disable
> -fgcse. In fact, this seems to be recommended by the GCC manual, for
> code like this:
>
> -fgcse
> Perform a global common subexpression elimination pass. This
> pass also performs global constant and copy propagation.
>
> Note: When compiling a program using computed gotos, a GCC
> extension, you may get better run-time performance if you
> disable the global common subexpression elimination pass by
> adding -fno-gcse to the command line.
>
> Enabled at levels -O2, -O3, -Os.
>
> This code indeed relies extensively on computed gotos. I don't know
> *why* disabling this optimization would improve performance. In fact
> I really don't see how it could make much of a difference either way.
>
> Anyway, using -fno-gcse makes optimization #2 go away and makes
> objtool happy, with only a fix for #1 needed.
>
> If -fno-gcse isn't an option, we might be able to fix objtool by using
> the "first_jump_src" thing which Peter added, improving it such that
> it also takes table jumps into account.

Sorry for the delay. I'm mostly off-grid until next week.
As far as -fno-gcse goes, I don't mind as long as it doesn't hurt performance.
Which I suspect it will :(
All these indirect gotos are there for performance.
A single indirect goto and a bunch of "jmp select_insn"
are way slower, since there is only one instruction
for the CPU branch predictor to work with.
When every insn is followed by "jmp *jumptable"
there is more room for the CPU to speculate.
It's been a long time, but when I wrote it the difference
between replicated indirect gotos and a single indirect goto was almost 2x.
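
A minimal user-space sketch of the replicated-dispatch shape described
above (hypothetical opcodes and GCC computed gotos; a sketch, not
kernel code). Each handler ends in its own "goto *tbl[...]", so the
branch predictor gets separate history per dispatch site:

	#include <stdio.h>

	enum { OP_INC, OP_DEC, OP_HALT };

	static int run(const unsigned char *prog)
	{
		static const void * const tbl[] = { &&inc, &&dec, &&halt };
		int acc = 0;

		goto *tbl[*prog];	/* initial dispatch */
	inc:
		acc++;
		goto *tbl[*++prog];	/* replicated dispatch site */
	dec:
		acc--;
		goto *tbl[*++prog];	/* replicated dispatch site */
	halt:
		return acc;
	}

	int main(void)
	{
		const unsigned char prog[] = { OP_INC, OP_INC, OP_DEC, OP_HALT };

		printf("%d\n", run(prog));	/* prints 1 */
		return 0;
	}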

2019-07-08 23:40:20

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 08, 2019 at 03:15:37PM -0700, Alexei Starovoitov wrote:
> > 2)
> >
> > After doing the first optimization, GCC then does another one which is
> > a little trickier. It replaces:
> >
> > select_insn:
> > jmp *jumptable(, %rax, 8)
> > ...
> > ALU64_ADD_X:
> > ...
> > jmp *jumptable(, %rax, 8)
> > ALU_ADD_X:
> > ...
> > jmp *jumptable(, %rax, 8)
> >
> > with
> >
> > select_insn:
> > mov jumptable, %r12
> > jmp *(%r12, %rax, 8)
> > ...
> > ALU64_ADD_X:
> > ...
> > jmp *(%r12, %rax, 8)
> > ALU_ADD_X:
> > ...
> > jmp *(%r12, %rax, 8)
> >
> > The problem is that it only moves the jumptable address into %r12
> > once, for the entire function, then it goes through multiple recursive
> > indirect jumps which rely on that %r12 value. But objtool isn't yet
> > smart enough to be able to track the value across multiple recursive
> > indirect jumps through the jump table.
> >
> > After some digging I found that the quick and easy fix is to disable
> > -fgcse. In fact, this seems to be recommended by the GCC manual, for
> > code like this:
> >
> > -fgcse
> > Perform a global common subexpression elimination pass. This
> > pass also performs global constant and copy propagation.
> >
> > Note: When compiling a program using computed gotos, a GCC
> > extension, you may get better run-time performance if you
> > disable the global common subexpression elimination pass by
> > adding -fno-gcse to the command line.
> >
> > Enabled at levels -O2, -O3, -Os.
> >
> > This code indeed relies extensively on computed gotos. I don't know
> > *why* disabling this optimization would improve performance. In fact
> > I really don't see how it could make much of a difference either way.
> >
> > Anyway, using -fno-gcse makes optimization #2 go away and makes
> > objtool happy, with only a fix for #1 needed.
> >
> > If -fno-gcse isn't an option, we might be able to fix objtool by using
> > the "first_jump_src" thing which Peter added, improving it such that
> > it also takes table jumps into account.
>
> Sorry for the delay. I'm mostly off-grid until next week.
> As far as -fno-gcse goes, I don't mind as long as it doesn't hurt performance.
> Which I suspect it will :(
> All these indirect gotos are there for performance.
> A single indirect goto and a bunch of "jmp select_insn"
> are way slower, since there is only one instruction
> for the CPU branch predictor to work with.
> When every insn is followed by "jmp *jumptable"
> there is more room for the CPU to speculate.
> It's been a long time, but when I wrote it the difference
> between replicated indirect gotos and a single indirect goto was almost 2x.

Just to clarify, -fno-gcse doesn't get rid of any of the indirect jumps.
It still has 166 indirect jumps. It just gets rid of the second
optimization, where the jumptable address is placed in a register.

If you have a benchmark which is relatively easy to use, I could try to
run some tests.

--
Josh

2019-07-08 23:41:20

by Alexei Starovoitov

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 8, 2019 at 3:38 PM Josh Poimboeuf <[email protected]> wrote:
>
> On Mon, Jul 08, 2019 at 03:15:37PM -0700, Alexei Starovoitov wrote:
> > > 2)
> > >
> > > After doing the first optimization, GCC then does another one which is
> > > a little trickier. It replaces:
> > >
> > > select_insn:
> > > jmp *jumptable(, %rax, 8)
> > > ...
> > > ALU64_ADD_X:
> > > ...
> > > jmp *jumptable(, %rax, 8)
> > > ALU_ADD_X:
> > > ...
> > > jmp *jumptable(, %rax, 8)
> > >
> > > with
> > >
> > > select_insn:
> > > mov jumptable, %r12
> > > jmp *(%r12, %rax, 8)
> > > ...
> > > ALU64_ADD_X:
> > > ...
> > > jmp *(%r12, %rax, 8)
> > > ALU_ADD_X:
> > > ...
> > > jmp *(%r12, %rax, 8)
> > >
> > > The problem is that it only moves the jumptable address into %r12
> > > once, for the entire function, then it goes through multiple recursive
> > > indirect jumps which rely on that %r12 value. But objtool isn't yet
> > > smart enough to be able to track the value across multiple recursive
> > > indirect jumps through the jump table.
> > >
> > > After some digging I found that the quick and easy fix is to disable
> > > -fgcse. In fact, this seems to be recommended by the GCC manual, for
> > > code like this:
> > >
> > > -fgcse
> > > Perform a global common subexpression elimination pass. This
> > > pass also performs global constant and copy propagation.
> > >
> > > Note: When compiling a program using computed gotos, a GCC
> > > extension, you may get better run-time performance if you
> > > disable the global common subexpression elimination pass by
> > > adding -fno-gcse to the command line.
> > >
> > > Enabled at levels -O2, -O3, -Os.
> > >
> > > This code indeed relies extensively on computed gotos. I don't know
> > > *why* disabling this optimization would improve performance. In fact
> > > I really don't see how it could make much of a difference either way.
> > >
> > > Anyway, using -fno-gcse makes optimization #2 go away and makes
> > > objtool happy, with only a fix for #1 needed.
> > >
> > > If -fno-gcse isn't an option, we might be able to fix objtool by using
> > > the "first_jump_src" thing which Peter added, improving it such that
> > > it also takes table jumps into account.
> >
> > Sorry for the delay. I'm mostly off-grid until next week.
> > As far as -fno-gcse goes, I don't mind as long as it doesn't hurt performance.
> > Which I suspect it will :(
> > All these indirect gotos are there for performance.
> > A single indirect goto and a bunch of "jmp select_insn"
> > are way slower, since there is only one instruction
> > for the CPU branch predictor to work with.
> > When every insn is followed by "jmp *jumptable"
> > there is more room for the CPU to speculate.
> > It's been a long time, but when I wrote it the difference
> > between replicated indirect gotos and a single indirect goto was almost 2x.
>
> Just to clarify, -fno-gcse doesn't get rid of any of the indirect jumps.
> It still has 166 indirect jumps. It just gets rid of the second
> optimization, where the jumptable address is placed in a register.

What about other functions in core.c?
Maybe it's easier to teach objtool to recognize that pattern?

> If you have a benchmark which is relatively easy to use, I could try to
> run some tests.

modprobe test_bpf
selftests/bpf/test_progs
both print runtime.
Some of test_progs have high run-to-run variations though.

2019-07-08 23:42:56

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 08, 2019 at 03:49:33PM -0700, Alexei Starovoitov wrote:
> > > Sorry for the delay. I'm mostly off-grid until next week.
> > > As far as -fno-gcse goes, I don't mind as long as it doesn't hurt performance.
> > > Which I suspect it will :(
> > > All these indirect gotos are there for performance.
> > > A single indirect goto and a bunch of "jmp select_insn"
> > > are way slower, since there is only one instruction
> > > for the CPU branch predictor to work with.
> > > When every insn is followed by "jmp *jumptable"
> > > there is more room for the CPU to speculate.
> > > It's been a long time, but when I wrote it the difference
> > > between replicated indirect gotos and a single indirect goto was almost 2x.
> >
> > Just to clarify, -fno-gcse doesn't get rid of any of the indirect jumps.
> > It still has 166 indirect jumps. It just gets rid of the second
> > optimization, where the jumptable address is placed in a register.
>
> What about other functions in core.c?
> Maybe it's easier to teach objtool to recognize that pattern?

The GCC man page actually recommends using -fno-gcse for computed-goto
code, for better performance. So if that's true, then it would be a
win-win, because objtool wouldn't need a change for it.

Otherwise I can teach objtool to recognize the new pattern.

> > If you have a benchmark which is relatively easy to use, I could try to
> > run some tests.
>
> modprobe test_bpf
> selftests/bpf/test_progs
> both print runtime.
> Some of test_progs have high run-to-run variations though.

Thanks, I'll give it a shot.

--
Josh

2019-07-08 23:45:30

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 08, 2019 at 05:53:59PM -0500, Josh Poimboeuf wrote:
> On Mon, Jul 08, 2019 at 03:49:33PM -0700, Alexei Starovoitov wrote:
> > > > Sorry for the delay. I'm mostly off-grid until next week.
> > > > As far as -fno-gcse goes, I don't mind as long as it doesn't hurt performance.
> > > > Which I suspect it will :(
> > > > All these indirect gotos are there for performance.
> > > > A single indirect goto and a bunch of "jmp select_insn"
> > > > are way slower, since there is only one instruction
> > > > for the CPU branch predictor to work with.
> > > > When every insn is followed by "jmp *jumptable"
> > > > there is more room for the CPU to speculate.
> > > > It's been a long time, but when I wrote it the difference
> > > > between replicated indirect gotos and a single indirect goto was almost 2x.
> > >
> > > Just to clarify, -fno-gcse doesn't get rid of any of the indirect jumps.
> > > It still has 166 indirect jumps. It just gets rid of the second
> > > optimization, where the jumptable address is placed in a register.
> >
> > What about other functions in core.c?
> > Maybe it's easier to teach objtool to recognize that pattern?
>
> The GCC man page actually recommends using -fno-gcse for computed-goto
> code, for better performance. So if that's true, then it would be a
> win-win, because objtool wouldn't need a change for it.
>
> Otherwise I can teach objtool to recognize the new pattern.
>
> > > If you have a benchmark which is relatively easy to use, I could try to
> > > run some tests.
> >
> > modprobe test_bpf
> > selftests/bpf/test_progs
> > both print runtime.
> > Some of test_progs have high run-to-run variations though.
>
> Thanks, I'll give it a shot.

I modprobed test_bpf with JIT disabled.

Before: 2.493018s
After: 2.523572s

So it looks like it's either no change, or slightly slower.

I'll just teach objtool to recognize the optimization.

--
Josh

2019-07-08 23:47:26

by Alexei Starovoitov

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 8, 2019 at 4:02 PM Josh Poimboeuf <[email protected]> wrote:
> > >
> > > modprobe test_bpf
> > > selftests/bpf/test_progs
> > > both print runtime.
> > > Some of test_progs have high run-to-run variations though.
> >
> > Thanks, I'll give it a shot.
>
> I modprobed test_bpf with JIT disabled.
>
> Before: 2.493018s
> After: 2.523572s
>
> So it looks like it's either no change, or slightly slower.

Total time is hard to compare.
Could you compare a few tests,
like the two that are called "tcpdump *"?

I think a small regression is ok.
Folks that care about performance should be using the JIT.

Subject: [tip:x86/debug] objtool: Add support for C jump tables

Commit-ID: 87b512def792579641499d9bef1d640994ea9c18
Gitweb: https://git.kernel.org/tip/87b512def792579641499d9bef1d640994ea9c18
Author: Josh Poimboeuf <[email protected]>
AuthorDate: Thu, 27 Jun 2019 20:50:46 -0500
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 9 Jul 2019 13:55:46 +0200

objtool: Add support for C jump tables

Objtool doesn't know how to read C jump tables, so it has to whitelist
functions which use them, causing missing ORC unwinder data for such
functions, e.g. ___bpf_prog_run().

C jump tables are very similar to GCC switch jump tables, which objtool
already knows how to read. So adding support for C jump tables is easy.
It just needs to be able to find the tables and distinguish them from
other data.

To allow the jump tables to be found, create an __annotate_jump_table
macro which can be used to annotate them.

The annotation is done by placing the jump table in an
.rodata..c_jump_table section. The '.rodata' prefix ensures that the data
will be placed in the rodata section by the vmlinux linker script. The
double periods are part of an existing convention which distinguishes
kernel sections from GCC sections.

Signed-off-by: Josh Poimboeuf <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Song Liu <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Daniel Borkmann <[email protected]>
Link: https://lkml.kernel.org/r/0ba2ca30442b16b97165992381ce643dc27b3d1a.1561685471.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/compiler.h | 5 +++++
tools/objtool/check.c | 27 ++++++++++++++++++++-------
2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8aaf7cd026b0..f0fd5636fddb 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -116,9 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
".pushsection .discard.unreachable\n\t" \
".long 999b - .\n\t" \
".popsection\n\t"
+
+/* Annotate a C jump table to allow objtool to follow the code flow */
+#define __annotate_jump_table __section(".rodata..c_jump_table")
+
#else
#define annotate_reachable()
#define annotate_unreachable()
+#define __annotate_jump_table
#endif

#ifndef ASM_UNREACHABLE
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 172f99195726..27818a93f0b1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -18,6 +18,8 @@

#define FAKE_JUMP_OFFSET -1

+#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table"
+
struct alternative {
struct list_head list;
struct instruction *insn;
@@ -1035,9 +1037,15 @@ static struct rela *find_switch_table(struct objtool_file *file,

/*
* Make sure the .rodata address isn't associated with a
- * symbol. gcc jump tables are anonymous data.
+ * symbol. GCC jump tables are anonymous data.
+ *
+ * Also support C jump tables which are in the same format as
+ * switch jump tables. For objtool to recognize them, they
+ * need to be placed in the C_JUMP_TABLE_SECTION section. They
+ * have symbols associated with them.
*/
- if (find_symbol_containing(rodata_sec, table_offset))
+ if (find_symbol_containing(rodata_sec, table_offset) &&
+ strcmp(rodata_sec->name, C_JUMP_TABLE_SECTION))
continue;

rodata_rela = find_rela_by_dest(rodata_sec, table_offset);
@@ -1277,13 +1285,18 @@ static void mark_rodata(struct objtool_file *file)
bool found = false;

/*
- * This searches for the .rodata section or multiple .rodata.func_name
- * sections if -fdata-sections is being used. The .str.1.1 and .str.1.8
- * rodata sections are ignored as they don't contain jump tables.
+ * Search for the following rodata sections, each of which can
+ * potentially contain jump tables:
+ *
+ * - .rodata: can contain GCC switch tables
+ * - .rodata.<func>: same, if -fdata-sections is being used
+ * - .rodata..c_jump_table: contains C annotated jump tables
+ *
+ * .rodata.str1.* sections are ignored; they don't contain jump tables.
*/
for_each_sec(file, sec) {
- if (!strncmp(sec->name, ".rodata", 7) &&
- !strstr(sec->name, ".str1.")) {
+ if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) ||
+ !strcmp(sec->name, C_JUMP_TABLE_SECTION)) {
sec->rodata = true;
found = true;
}

Subject: [tip:x86/debug] bpf: Fix ORC unwinding in non-JIT BPF code

Commit-ID: e55a73251da335873a6e87d68fb17e5aabb8978e
Gitweb: https://git.kernel.org/tip/e55a73251da335873a6e87d68fb17e5aabb8978e
Author: Josh Poimboeuf <[email protected]>
AuthorDate: Thu, 27 Jun 2019 20:50:47 -0500
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 9 Jul 2019 13:55:57 +0200

bpf: Fix ORC unwinding in non-JIT BPF code

Objtool previously ignored ___bpf_prog_run() because it didn't understand
the jump table. This resulted in the ORC unwinder not being able to unwind
through non-JIT BPF code.

Now that objtool knows how to read jump tables, remove the whitelist and
annotate the jump table so objtool can recognize it.

Also add an additional "const" to the jump table definition to clarify that
the text pointers are constant. Otherwise GCC sets the section writable
flag and the assembler spits out warnings.

Fixes: d15d356887e7 ("perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER")
Reported-by: Song Liu <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Daniel Borkmann <[email protected]>
Link: https://lkml.kernel.org/r/881939122b88f32be4c374d248c09d7527a87e35.1561685471.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/bpf/core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 080e2bb644cc..1e12ac382a90 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1299,7 +1299,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
- static const void *jumptable[256] = {
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
@@ -1558,7 +1558,6 @@ out:
BUG_ON(1);
return 0;
}
-STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */

#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \

2019-07-09 18:04:04

by Alexei Starovoitov

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Tue, Jul 9, 2019 at 10:48 AM Josh Poimboeuf <[email protected]> wrote:
>
> On Mon, Jul 08, 2019 at 04:16:25PM -0700, Alexei Starovoitov wrote:
> > Total time is hard to compare.
> > Could you compare a few tests,
> > like the two that are called "tcpdump *"?
> >
> > I think a small regression is ok.
> > Folks that care about performance should be using the JIT.
>
> I did each test 20 times and computed the averages:
>
> "tcpdump port 22":
> default: 0.00743175s
> -fno-gcse: 0.00709920s (~4.5% speedup)
>
> "tcpdump complex":
> default: 0.00876715s
> -fno-gcse: 0.00854895s (~2.5% speedup)
>
> So there does seem to be a small performance gain by disabling this
> optimization.

great. thanks for checking.

> We could change it for the whole file, by adjusting CFLAGS_core.o in the
> BPF makefile, or we could change it for the function only with something
> like the below patch.
>
> Thoughts?
>
> diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
> index e8579412ad21..d7ee4c6bad48 100644
> --- a/include/linux/compiler-gcc.h
> +++ b/include/linux/compiler-gcc.h
> @@ -170,3 +170,5 @@
> #else
> #define __diag_GCC_8(s)
> #endif
> +
> +#define __no_fgcse __attribute__((optimize("-fno-gcse")))
> diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
> index 095d55c3834d..599c27b56c29 100644
> --- a/include/linux/compiler_types.h
> +++ b/include/linux/compiler_types.h
> @@ -189,6 +189,10 @@ struct ftrace_likely_data {
> #define asm_volatile_goto(x...) asm goto(x)
> #endif
>
> +#ifndef __no_fgcse
> +# define __no_fgcse
> +#endif
> +
> /* Are two types/vars the same type (ignoring qualifiers)? */
> #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
>
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 7e98f36a14e2..8191a7db2777 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
> *
> * Decode and execute eBPF instructions.
> */
> -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
> +static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)

I prefer the per-function flag.
If you want to route it via tip:
Acked-by: Alexei Starovoitov <[email protected]>

or Daniel can take it into bpf tree while I'm traveling.

2019-07-09 18:20:57

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Mon, Jul 08, 2019 at 04:16:25PM -0700, Alexei Starovoitov wrote:
> Total time is hard to compare.
> Could you compare a few tests,
> like the two that are called "tcpdump *"?
>
> I think a small regression is ok.
> Folks that care about performance should be using the JIT.

I did each test 20 times and computed the averages:

"tcpdump port 22":
default: 0.00743175s
-fno-gcse: 0.00709920s (~4.5% speedup)

"tcpdump complex":
default: 0.00876715s
-fno-gcse: 0.00854895s (~2.5% speedup)

So there does seem to be a small performance gain by disabling this
optimization.

We could change it for the whole file, by adjusting CFLAGS_core.o in the
BPF makefile, or we could change it for the function only with something
like the below patch.

Thoughts?

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index e8579412ad21..d7ee4c6bad48 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -170,3 +170,5 @@
#else
#define __diag_GCC_8(s)
#endif
+
+#define __no_fgcse __attribute__((optimize("-fno-gcse")))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 095d55c3834d..599c27b56c29 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -189,6 +189,10 @@ struct ftrace_likely_data {
#define asm_volatile_goto(x...) asm goto(x)
#endif

+#ifndef __no_fgcse
+# define __no_fgcse
+#endif
+
/* Are two types/vars the same type (ignoring qualifiers)? */
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7e98f36a14e2..8191a7db2777 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
*
* Decode and execute eBPF instructions.
*/
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z

2019-07-09 19:22:43

by Josh Poimboeuf

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On Tue, Jul 09, 2019 at 11:02:40AM -0700, Alexei Starovoitov wrote:
> On Tue, Jul 9, 2019 at 10:48 AM Josh Poimboeuf <[email protected]> wrote:
> >
> > On Mon, Jul 08, 2019 at 04:16:25PM -0700, Alexei Starovoitov wrote:
> > > Total time is hard to compare.
> > > Could you compare a few tests,
> > > like the two that are called "tcpdump *"?
> > >
> > > I think a small regression is ok.
> > > Folks that care about performance should be using the JIT.
> >
> > I did each test 20 times and computed the averages:
> >
> > "tcpdump port 22":
> > default: 0.00743175s
> > -fno-gcse: 0.00709920s (~4.5% speedup)
> >
> > "tcpdump complex":
> > default: 0.00876715s
> > -fno-gcse: 0.00854895s (~2.5% speedup)
> >
> > So there does seem to be a small performance gain by disabling this
> > optimization.
>
> great. thanks for checking.
>
> > We could change it for the whole file, by adjusting CFLAGS_core.o in the
> > BPF makefile, or we could change it for the function only with something
> > like the below patch.
> >
> > Thoughts?
> >
> > diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
> > index e8579412ad21..d7ee4c6bad48 100644
> > --- a/include/linux/compiler-gcc.h
> > +++ b/include/linux/compiler-gcc.h
> > @@ -170,3 +170,5 @@
> > #else
> > #define __diag_GCC_8(s)
> > #endif
> > +
> > +#define __no_fgcse __attribute__((optimize("-fno-gcse")))
> > diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
> > index 095d55c3834d..599c27b56c29 100644
> > --- a/include/linux/compiler_types.h
> > +++ b/include/linux/compiler_types.h
> > @@ -189,6 +189,10 @@ struct ftrace_likely_data {
> > #define asm_volatile_goto(x...) asm goto(x)
> > #endif
> >
> > +#ifndef __no_fgcse
> > +# define __no_fgcse
> > +#endif
> > +
> > /* Are two types/vars the same type (ignoring qualifiers)? */
> > #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
> >
> > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > index 7e98f36a14e2..8191a7db2777 100644
> > --- a/kernel/bpf/core.c
> > +++ b/kernel/bpf/core.c
> > @@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
> > *
> > * Decode and execute eBPF instructions.
> > */
> > -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
> > +static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
>
> I prefer the per-function flag.
> If you want to route it via tip:
> Acked-by: Alexei Starovoitov <[email protected]>
>
> or Daniel can take it into bpf tree while I'm traveling.

Thanks! I'll probably send it through the tip tree, along with an
objtool fix for the other optimization.

--
Josh

2019-07-09 19:28:39

by Daniel Borkmann

Subject: Re: [tip:x86/urgent] bpf: Fix ORC unwinding in non-JIT BPF code

On 07/09/2019 09:17 PM, Josh Poimboeuf wrote:
> On Tue, Jul 09, 2019 at 11:02:40AM -0700, Alexei Starovoitov wrote:
>> On Tue, Jul 9, 2019 at 10:48 AM Josh Poimboeuf <[email protected]> wrote:
>>>
>>> On Mon, Jul 08, 2019 at 04:16:25PM -0700, Alexei Starovoitov wrote:
>>>> Total time is hard to compare.
>>>> Could you compare a few tests,
>>>> like the two that are called "tcpdump *"?
>>>>
>>>> I think a small regression is ok.
>>>> Folks that care about performance should be using the JIT.
>>>
>>> I did each test 20 times and computed the averages:
>>>
>>> "tcpdump port 22":
>>> default: 0.00743175s
>>> -fno-gcse: 0.00709920s (~4.5% speedup)
>>>
>>> "tcpdump complex":
>>> default: 0.00876715s
>>> -fno-gcse: 0.00854895s (~2.5% speedup)
>>>
>>> So there does seem to be a small performance gain by disabling this
>>> optimization.
>>
>> great. thanks for checking.
>>
>>> We could change it for the whole file, by adjusting CFLAGS_core.o in the
>>> BPF makefile, or we could change it for the function only with something
>>> like the below patch.
>>>
>>> Thoughts?
>>>
>>> diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
>>> index e8579412ad21..d7ee4c6bad48 100644
>>> --- a/include/linux/compiler-gcc.h
>>> +++ b/include/linux/compiler-gcc.h
>>> @@ -170,3 +170,5 @@
>>> #else
>>> #define __diag_GCC_8(s)
>>> #endif
>>> +
>>> +#define __no_fgcse __attribute__((optimize("-fno-gcse")))
>>> diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
>>> index 095d55c3834d..599c27b56c29 100644
>>> --- a/include/linux/compiler_types.h
>>> +++ b/include/linux/compiler_types.h
>>> @@ -189,6 +189,10 @@ struct ftrace_likely_data {
>>> #define asm_volatile_goto(x...) asm goto(x)
>>> #endif
>>>
>>> +#ifndef __no_fgcse
>>> +# define __no_fgcse
>>> +#endif
>>> +
>>> /* Are two types/vars the same type (ignoring qualifiers)? */
>>> #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
>>>
>>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>>> index 7e98f36a14e2..8191a7db2777 100644
>>> --- a/kernel/bpf/core.c
>>> +++ b/kernel/bpf/core.c
>>> @@ -1295,7 +1295,7 @@ bool bpf_opcode_in_insntable(u8 code)
>>> *
>>> * Decode and execute eBPF instructions.
>>> */
>>> -static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
>>> +static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
>>
>> I prefer the per-function flag.

Same preference from my side.

>> If you want to route it via tip:
>> Acked-by: Alexei Starovoitov <[email protected]>
>>
>> or Daniel can take it into bpf tree while I'm traveling.
>
> Thanks! I'll probably send it through the tip tree, along with an
> objtool fix for the other optimization.

Ok, sounds good, thanks!