This is kind of like the 32-bit and compat code, except that I
preserved the fast path this time. I was unable to measure any
significant performance change on my laptop in the fast path.
What do you all think?
Andy Lutomirski (12):
selftests/x86: Extend Makefile to allow 64-bit only tests
selftests/x86: Add check_initial_reg_state
x86/syscalls: Refactor syscalltbl.sh
x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32
x86/syscalls: Move compat syscall entry handling into syscalltbl.sh
x86/syscalls: Add syscall entry qualifiers
x86/entry/64: Always run ptregs-using syscalls on the slow path
x86/entry/64: Call all native slow-path syscalls with full pt-regs
x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork
x86/entry/64: Migrate the 64-bit syscall slow path to C
x86/entry/32: Change INT80 to be an interrupt gate
x86/entry: Do enter_from_user_mode with IRQs off
arch/x86/entry/common.c | 80 +++----
arch/x86/entry/entry_32.S | 8 +-
arch/x86/entry/entry_64.S | 245 ++++++---------------
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/entry/syscall_32.c | 10 +-
arch/x86/entry/syscall_64.c | 30 ++-
arch/x86/entry/syscalls/syscall_64.tbl | 18 +-
arch/x86/entry/syscalls/syscalltbl.sh | 58 ++++-
arch/x86/include/asm/thread_info.h | 5 +-
arch/x86/kernel/asm-offsets_32.c | 2 +-
arch/x86/kernel/asm-offsets_64.c | 10 +-
arch/x86/kernel/traps.c | 2 +-
arch/x86/um/sys_call_table_32.c | 4 +-
arch/x86/um/sys_call_table_64.c | 7 +-
arch/x86/um/user-offsets.c | 6 +-
tools/testing/selftests/x86/Makefile | 13 +-
.../selftests/x86/check_initial_reg_state.c | 108 +++++++++
17 files changed, 330 insertions(+), 278 deletions(-)
create mode 100644 tools/testing/selftests/x86/check_initial_reg_state.c
--
2.5.0
There aren't any 64-bit-only tests yet, but there might be a few some day.
Signed-off-by: Andy Lutomirski <[email protected]>
---
tools/testing/selftests/x86/Makefile | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 389701f59940..a460fe7c5365 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -8,8 +8,9 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptr
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
-BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
+BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
@@ -37,7 +38,7 @@ clean:
$(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
-$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
+$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
$(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
# x86_64 users should be encouraged to install 32-bit libraries
--
2.5.0
This checks that ELF binaries are started with an appropriately
blank register state.
(There's currently a nasty special case in the entry asm to arrange
for this. I'm planning on removing the special case, and this will
help make sure I don't break it.)
Signed-off-by: Andy Lutomirski <[email protected]>
---
tools/testing/selftests/x86/Makefile | 8 +-
.../selftests/x86/check_initial_reg_state.c | 108 +++++++++++++++++++++
2 files changed, 115 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/x86/check_initial_reg_state.c
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index a460fe7c5365..b82409421fa6 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,7 +4,7 @@ include ../lib.mk
.PHONY: all all_32 all_64 warn_32bit_failure clean
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall check_initial_reg_state
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
@@ -63,3 +63,9 @@ endif
sysret_ss_attrs_64: thunks.S
ptrace_syscall_32: raw_syscall_helper_32.S
test_syscall_vdso_32: thunks_32.S
+
+# check_initial_reg_state is special: it needs a custom entry, and it
+# needs to be static so that its interpreter doesn't destroy its initial
+# state.
+check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
+check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
new file mode 100644
index 000000000000..0cb565f7786d
--- /dev/null
+++ b/tools/testing/selftests/x86/check_initial_reg_state.c
@@ -0,0 +1,108 @@
+/*
+ * check_initial_reg_state.c - check that execve sets the correct state
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+
+unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
+unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
+
+asm (".pushsection .text\n\t"
+ ".type real_start, @function\n\t"
+ ".global real_start\n\t"
+ "real_start:\n\t"
+#ifdef __x86_64__
+ "mov %rax, ax\n\t"
+ "mov %rbx, bx\n\t"
+ "mov %rcx, cx\n\t"
+ "mov %rdx, dx\n\t"
+ "mov %rsi, si\n\t"
+ "mov %rdi, di\n\t"
+ "mov %rbp, bp\n\t"
+ "mov %rsp, sp\n\t"
+ "mov %r8, r8\n\t"
+ "mov %r9, r9\n\t"
+ "mov %r10, r10\n\t"
+ "mov %r11, r11\n\t"
+ "mov %r12, r12\n\t"
+ "mov %r13, r13\n\t"
+ "mov %r14, r14\n\t"
+ "mov %r15, r15\n\t"
+ "pushfq\n\t"
+ "popq flags\n\t"
+#else
+ "mov %eax, ax\n\t"
+ "mov %ebx, bx\n\t"
+ "mov %ecx, cx\n\t"
+ "mov %edx, dx\n\t"
+ "mov %esi, si\n\t"
+ "mov %edi, di\n\t"
+ "mov %ebp, bp\n\t"
+ "mov %esp, sp\n\t"
+ "pushfl\n\t"
+ "popl flags\n\t"
+#endif
+ "jmp _start\n\t"
+ ".size real_start, . - real_start\n\t"
+ ".popsection");
+
+int main()
+{
+ int nerrs = 0;
+
+ if (sp == 0) {
+ printf("[FAIL]\tTest was built incorrectly\n");
+ return 1;
+ }
+
+ if (ax || bx || cx || dx || si || di || bp
+#ifdef __x86_64__
+ || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15
+#endif
+ ) {
+ printf("[FAIL]\tAll GPRs except SP should be 0\n");
+#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
+ SHOW(ax);
+ SHOW(bx);
+ SHOW(cx);
+ SHOW(dx);
+ SHOW(si);
+ SHOW(di);
+ SHOW(bp);
+ SHOW(sp);
+#ifdef __x86_64__
+ SHOW(r8);
+ SHOW(r9);
+ SHOW(r10);
+ SHOW(r11);
+ SHOW(r12);
+ SHOW(r13);
+ SHOW(r14);
+ SHOW(r15);
+#endif
+ nerrs++;
+ } else {
+ printf("[OK]\tAll GPRs except SP are 0\n");
+ }
+
+ if (flags != 0x202) {
+ printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
+ nerrs++;
+ } else {
+ printf("[OK]\tFLAGS is 0x202\n");
+ }
+
+ return nerrs ? 1 : 0;
+}
--
2.5.0
This splits out the code to emit a syscall line.
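As a concrete example (taking the 'open' row from syscall_32.tbl and the
'exit' row from syscall_64.tbl; the generated output is unchanged by this
refactor), the table lines
    5	i386	open		sys_open	compat_sys_open
    60	common	exit		sys_exit
still come out as
    __SYSCALL_I386(5, sys_open, compat_sys_open)
    __SYSCALL_COMMON(60, sys_exit, sys_exit)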
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/syscalls/syscalltbl.sh | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..167965ee742e 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,13 +3,21 @@
in="$1"
out="$2"
+emit() {
+ abi="$1"
+ nr="$2"
+ entry="$3"
+ compat="$4"
+ if [ -n "$compat" ]; then
+ echo "__SYSCALL_${abi}($nr, $entry, $compat)"
+ elif [ -n "$entry" ]; then
+ echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+ fi
+}
+
grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
- if [ -n "$compat" ]; then
- echo "__SYSCALL_${abi}($nr, $entry, $compat)"
- elif [ -n "$entry" ]; then
- echo "__SYSCALL_${abi}($nr, $entry, $entry)"
- fi
+ emit "$abi" "$nr" "$entry" "$compat"
done
) > "$out"
--
2.5.0
The common/64/x32 distinction has no effect other than determining
which kernels actually support the syscall. Move the logic into
syscalltbl.sh.
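Concretely (a sketch of the generated output, using the 'exit' and the
x32 'execve' rows from syscall_64.tbl), asm/syscalls_64.h now contains
lines like
    __SYSCALL_64(60, sys_exit, sys_exit)
    #ifdef CONFIG_X86_X32_ABI
    __SYSCALL_64(520, stub_x32_execve, stub_x32_execve)
    #endif
so the common/64/x32 decision is expressed once, in the generated header,
instead of by every consumer defining __SYSCALL_COMMON and __SYSCALL_X32.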
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/syscall_64.c | 8 --------
arch/x86/entry/syscalls/syscalltbl.sh | 17 ++++++++++++++++-
arch/x86/kernel/asm-offsets_64.c | 6 ------
arch/x86/um/sys_call_table_64.c | 3 ---
arch/x86/um/user-offsets.c | 2 --
5 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 41283d22be7a..974fd89ac806 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,14 +6,6 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
-
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 167965ee742e..5ebeaf1041e7 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -18,6 +18,21 @@ emit() {
grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
- emit "$abi" "$nr" "$entry" "$compat"
+ if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
+ # COMMON is the same as 64, except that we don't expect X32
+ # programs to use it. Our expectation has nothing to do with
+ # any generated code, so treat them the same.
+ emit 64 "$nr" "$entry" "$compat"
+ elif [ "$abi" == "X32" ]; then
+ # X32 is equivalent to 64 on an X32-compatible kernel.
+ echo "#ifdef CONFIG_X86_X32_ABI"
+ emit 64 "$nr" "$entry" "$compat"
+ echo "#endif"
+ elif [ "$abi" == "I386" ]; then
+ emit "$abi" "$nr" "$entry" "$compat"
+ else
+ echo "Unknown abi $abi" >&2
+ exit 1
+ fi
done
) > "$out"
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d8f42f902a0f..211224b68766 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,12 +5,6 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index b74ea6c2c0e7..71a497cde921 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,9 +35,6 @@
#define stub_execveat sys_execveat
#define stub_rt_sigreturn sys_rt_sigreturn
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
-
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ce7e3607a870..5edf4f4bbf53 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -15,8 +15,6 @@ static char syscalls[] = {
};
#else
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
-#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
static char syscalls[] = {
#include <asm/syscalls_64.h>
};
--
2.5.0
Rather than duplicating the compat entry handling in all consumers
of syscalls_BITS.h, handle it directly in syscalltbl.sh. Now we
generate entries in syscalls_32.h like:
#ifdef CONFIG_X86_32
__SYSCALL_I386(5, sys_open)
#else
__SYSCALL_I386(5, compat_sys_open)
#endif
and all of its consumers implicitly get the right entry point.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/syscall_32.c | 10 ++--------
arch/x86/entry/syscall_64.c | 4 ++--
arch/x86/entry/syscalls/syscalltbl.sh | 22 ++++++++++++++++++----
arch/x86/kernel/asm-offsets_32.c | 2 +-
arch/x86/kernel/asm-offsets_64.c | 4 ++--
arch/x86/um/sys_call_table_32.c | 4 ++--
arch/x86/um/sys_call_table_64.c | 4 ++--
arch/x86/um/user-offsets.c | 4 ++--
8 files changed, 31 insertions(+), 23 deletions(-)
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 9a6649857106..3e2829759da2 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,17 +6,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#ifdef CONFIG_IA32_EMULATION
-#define SYM(sym, compat) compat
-#else
-#define SYM(sym, compat) sym
-#endif
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
+#define __SYSCALL_I386(nr, sym) [nr] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 974fd89ac806..3781989b180e 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,11 +6,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_64(nr, sym) [nr] = sym,
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 5ebeaf1041e7..b81479c8c5fb 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -8,10 +8,24 @@ emit() {
nr="$2"
entry="$3"
compat="$4"
- if [ -n "$compat" ]; then
- echo "__SYSCALL_${abi}($nr, $entry, $compat)"
- elif [ -n "$entry" ]; then
- echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+
+ if [ "$abi" == "64" -a -n "$compat" ]; then
+ echo "a compat entry for a 64-bit syscall makes no sense" >&2
+ exit 1
+ fi
+
+ if [ -z "$compat" ]; then
+ if [ -n "$entry" ]; then
+ echo "__SYSCALL_${abi}($nr, $entry)"
+ fi
+ else
+ echo "#ifdef CONFIG_X86_32"
+ if [ -n "$entry" ]; then
+ echo "__SYSCALL_${abi}($nr, $entry)"
+ fi
+ echo "#else"
+ echo "__SYSCALL_${abi}($nr, $compat)"
+ echo "#endif"
fi
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6ce39025f467..abec4c9f1c97 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -7,7 +7,7 @@
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 211224b68766..7d2b4d2e36b2 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,11 +4,11 @@
#include <asm/ia32.h>
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_64(nr, sym) [nr] = 1,
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym) [nr] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 439c0994b696..d4669a679fd0 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -25,11 +25,11 @@
#define old_mmap sys_old_mmap
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_I386(nr, sym) [ nr ] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 71a497cde921..6ee5268beb05 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,11 +35,11 @@
#define stub_execveat sys_execveat
#define stub_rt_sigreturn sys_rt_sigreturn
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
+#define __SYSCALL_64(nr, sym) [ nr ] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index 5edf4f4bbf53..6c9a9c1eae32 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -9,12 +9,12 @@
#include <asm/types.h>
#ifdef __i386__
-#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_I386(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
};
#else
-#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_64(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_64.h>
};
--
2.5.0
This will let us specify something like sys_xyz/foo instead of
sys_xyz in the syscall table, where the foo conveys some information
to the C code.
The intent is to allow things like sys_execve/ptregs to indicate
that sys_execve touches pt_regs.
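For illustration (the table itself only grows these annotations in later
patches), a line like
    59	64	execve		sys_execve/ptregs
would be split by syscall_macro into
    __SYSCALL_64(59, sys_execve, ptregs)
while an unqualified line such as '60 common exit sys_exit' still produces
__SYSCALL_64(60, sys_exit, ) with an empty qualifier argument.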
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/syscall_32.c | 4 ++--
arch/x86/entry/syscall_64.c | 4 ++--
arch/x86/entry/syscalls/syscalltbl.sh | 19 ++++++++++++++++---
arch/x86/kernel/asm-offsets_32.c | 2 +-
arch/x86/kernel/asm-offsets_64.c | 4 ++--
arch/x86/um/sys_call_table_32.c | 4 ++--
arch/x86/um/sys_call_table_64.c | 4 ++--
arch/x86/um/user-offsets.c | 4 ++--
8 files changed, 29 insertions(+), 16 deletions(-)
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 3e2829759da2..8f895ee13a1c 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -6,11 +6,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 3781989b180e..a1d408772ae6 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,11 +6,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index b81479c8c5fb..cd3d3015d7df 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -3,6 +3,19 @@
in="$1"
out="$2"
+syscall_macro() {
+ abi="$1"
+ nr="$2"
+ entry="$3"
+
+ # Entry can be either just a function name or "function/qualifier"
+ real_entry="${entry%%/*}"
+ qualifier="${entry:${#real_entry}}" # Strip the function name
+ qualifier="${qualifier:1}" # Strip the slash, if any
+
+ echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+}
+
emit() {
abi="$1"
nr="$2"
@@ -16,15 +29,15 @@ emit() {
if [ -z "$compat" ]; then
if [ -n "$entry" ]; then
- echo "__SYSCALL_${abi}($nr, $entry)"
+ syscall_macro "$abi" "$nr" "$entry"
fi
else
echo "#ifdef CONFIG_X86_32"
if [ -n "$entry" ]; then
- echo "__SYSCALL_${abi}($nr, $entry)"
+ syscall_macro "$abi" "$nr" "$entry"
fi
echo "#else"
- echo "__SYSCALL_${abi}($nr, $compat)"
+ syscall_macro "$abi" "$nr" "$compat"
echo "#endif"
fi
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index abec4c9f1c97..fdeb0ce07c16 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -7,7 +7,7 @@
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
-#define __SYSCALL_I386(nr, sym) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7d2b4d2e36b2..5de52cf589be 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,11 +4,11 @@
#include <asm/ia32.h>
-#define __SYSCALL_64(nr, sym) [nr] = 1,
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
-#define __SYSCALL_I386(nr, sym) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index d4669a679fd0..bfce503dffae 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -25,11 +25,11 @@
#define old_mmap sys_old_mmap
-#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym) [ nr ] = sym,
+#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 6ee5268beb05..f306413d3eb6 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -35,11 +35,11 @@
#define stub_execveat sys_execveat
#define stub_rt_sigreturn sys_rt_sigreturn
-#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym) [ nr ] = sym,
+#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index 6c9a9c1eae32..470564bbd08e 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -9,12 +9,12 @@
#include <asm/types.h>
#ifdef __i386__
-#define __SYSCALL_I386(nr, sym) [nr] = 1,
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_32.h>
};
#else
-#define __SYSCALL_64(nr, sym) [nr] = 1,
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
static char syscalls[] = {
#include <asm/syscalls_64.h>
};
--
2.5.0
64-bit syscalls currently have an optimization in which they are
called with partial pt_regs. A small handful require full pt_regs.
In the 32-bit and compat cases, I cleaned this up by forcing full
pt_regs for all syscalls. The performance hit doesn't really matter.
I want to clean up the 64-bit case as well, but I don't want to hurt
fast path performance. To do that, I want to force the syscalls
that use pt_regs onto the slow path. This will enable us to make
slow path syscalls be real ABI-compliant C functions.
Use the new syscall entry qualification machinery for this.
stub_clone is now stub_clone/ptregs.
The next patch will eliminate the stubs, and we'll just have
sys_clone/ptregs.
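Sketching the macro expansion: for the table line
'56 common clone stub_clone/ptregs', the generated header contains
    __SYSCALL_64(56, stub_clone, ptregs)
The existing sys_call_table ignores the qualifier and still gets
[56] = stub_clone, while the new sys_call_table_fastpath_64 expands the
same line via __SYSCALL_64_QUAL_ptregs into [56] = stub_ptregs_64, which
bounces the syscall off the fast path and onto the slow path.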
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 17 +++++++++--------
arch/x86/entry/syscall_64.c | 18 ++++++++++++++++++
arch/x86/entry/syscalls/syscall_64.tbl | 16 ++++++++--------
3 files changed, 35 insertions(+), 16 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9d34d3cfceb6..a698b8092831 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -182,7 +182,7 @@ entry_SYSCALL_64_fastpath:
#endif
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10, %rcx
- call *sys_call_table(, %rax, 8)
+ call *sys_call_table_fastpath_64(, %rax, 8)
movq %rax, RAX(%rsp)
1:
/*
@@ -238,13 +238,6 @@ tracesys:
movq %rsp, %rdi
movl $AUDIT_ARCH_X86_64, %esi
call syscall_trace_enter_phase1
- test %rax, %rax
- jnz tracesys_phase2 /* if needed, run the slow path */
- RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
- movq ORIG_RAX(%rsp), %rax
- jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
-
-tracesys_phase2:
SAVE_EXTRA_REGS
movq %rsp, %rdi
movl $AUDIT_ARCH_X86_64, %esi
@@ -355,6 +348,14 @@ opportunistic_sysret_failed:
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
+ENTRY(stub_ptregs_64)
+ /*
+ * Syscalls marked as needing ptregs that go through the fast path
+ * land here. We transfer to the slow path.
+ */
+ addq $8, %rsp
+ jmp tracesys
+END(stub_ptregs_64)
.macro FORK_LIKE func
ENTRY(stub_\func)
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index a1d408772ae6..601745c667ce 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -22,3 +22,21 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
+
+#undef __SYSCALL_64
+
+extern long stub_ptregs_64(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+
+#define __SYSCALL_64_QUAL_(nr, sym) [nr] = sym,
+#define __SYSCALL_64_QUAL_ptregs(nr, sym) [nr] = stub_ptregs_64,
+
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(nr, sym)
+
+asmlinkage const sys_call_ptr_t sys_call_table_fastpath_64[__NR_syscall_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
+};
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842fdf1f6..6b9db2e338f4 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
12 common brk sys_brk
13 64 rt_sigaction sys_rt_sigaction
14 common rt_sigprocmask sys_rt_sigprocmask
-15 64 rt_sigreturn stub_rt_sigreturn
+15 64 rt_sigreturn stub_rt_sigreturn/ptregs
16 64 ioctl sys_ioctl
17 common pread64 sys_pread64
18 common pwrite64 sys_pwrite64
@@ -62,10 +62,10 @@
53 common socketpair sys_socketpair
54 64 setsockopt sys_setsockopt
55 64 getsockopt sys_getsockopt
-56 common clone stub_clone
-57 common fork stub_fork
-58 common vfork stub_vfork
-59 64 execve stub_execve
+56 common clone stub_clone/ptregs
+57 common fork stub_fork/ptregs
+58 common vfork stub_vfork/ptregs
+59 64 execve stub_execve/ptregs
60 common exit sys_exit
61 common wait4 sys_wait4
62 common kill sys_kill
@@ -328,7 +328,7 @@
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
-322 64 execveat stub_execveat
+322 64 execveat stub_execveat/ptregs
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
@@ -344,7 +344,7 @@
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
-520 x32 execve stub_x32_execve
+520 x32 execve stub_x32_execve/ptregs
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
@@ -369,4 +369,4 @@
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
-545 x32 execveat stub_x32_execveat
+545 x32 execveat stub_x32_execveat/ptregs
--
2.5.0
This removes all of the remaining asm syscall stubs except for
stub_ptregs_64. Entries in the main syscall table are now normal C
functions.
The resulting asm is every bit as ridiculous as it looks. The next
few patches will clean it up. This patch is here to let reviewers
rest their brains and to keep the series bisectable.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 79 +---------------------------------
arch/x86/entry/syscalls/syscall_64.tbl | 18 ++++----
2 files changed, 10 insertions(+), 87 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a698b8092831..8a6b7ce2beff 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -250,7 +250,6 @@ tracesys:
* the value it wants us to use in the table lookup.
*/
RESTORE_C_REGS_EXCEPT_RAX
- RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max, %rax
#else
@@ -261,6 +260,7 @@ tracesys:
movq %r10, %rcx /* fixup for C */
call *sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
+ RESTORE_EXTRA_REGS
1:
/* Use IRET because user could have changed pt_regs->foo */
@@ -357,83 +357,6 @@ ENTRY(stub_ptregs_64)
jmp tracesys
END(stub_ptregs_64)
- .macro FORK_LIKE func
-ENTRY(stub_\func)
- SAVE_EXTRA_REGS 8
- jmp sys_\func
-END(stub_\func)
- .endm
-
- FORK_LIKE clone
- FORK_LIKE fork
- FORK_LIKE vfork
-
-ENTRY(stub_execve)
- call sys_execve
-return_from_execve:
- testl %eax, %eax
- jz 1f
- /* exec failed, can use fast SYSRET code path in this case */
- ret
-1:
- /* must use IRET code path (pt_regs->cs may have changed) */
- addq $8, %rsp
- ZERO_EXTRA_REGS
- movq %rax, RAX(%rsp)
- jmp int_ret_from_sys_call
-END(stub_execve)
-/*
- * Remaining execve stubs are only 7 bytes long.
- * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
- */
- .align 8
-GLOBAL(stub_execveat)
- call sys_execveat
- jmp return_from_execve
-END(stub_execveat)
-
-#if defined(CONFIG_X86_X32_ABI)
- .align 8
-GLOBAL(stub_x32_execve)
- call compat_sys_execve
- jmp return_from_execve
-END(stub_x32_execve)
- .align 8
-GLOBAL(stub_x32_execveat)
- call compat_sys_execveat
- jmp return_from_execve
-END(stub_x32_execveat)
-#endif
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
- /*
- * SAVE_EXTRA_REGS result is not normally needed:
- * sigreturn overwrites all pt_regs->GPREGS.
- * But sigreturn can fail (!), and there is no easy way to detect that.
- * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
- * we SAVE_EXTRA_REGS here.
- */
- SAVE_EXTRA_REGS 8
- call sys_rt_sigreturn
-return_from_stub:
- addq $8, %rsp
- RESTORE_EXTRA_REGS
- movq %rax, RAX(%rsp)
- jmp int_ret_from_sys_call
-END(stub_rt_sigreturn)
-
-#ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_rt_sigreturn)
- SAVE_EXTRA_REGS 8
- call sys32_x32_rt_sigreturn
- jmp return_from_stub
-END(stub_x32_rt_sigreturn)
-#endif
-
/*
* A newly forked process directly context switches into this address.
*
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6b9db2e338f4..3039faa42061 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -21,7 +21,7 @@
12 common brk sys_brk
13 64 rt_sigaction sys_rt_sigaction
14 common rt_sigprocmask sys_rt_sigprocmask
-15 64 rt_sigreturn stub_rt_sigreturn/ptregs
+15 64 rt_sigreturn sys_rt_sigreturn/ptregs
16 64 ioctl sys_ioctl
17 common pread64 sys_pread64
18 common pwrite64 sys_pwrite64
@@ -62,10 +62,10 @@
53 common socketpair sys_socketpair
54 64 setsockopt sys_setsockopt
55 64 getsockopt sys_getsockopt
-56 common clone stub_clone/ptregs
-57 common fork stub_fork/ptregs
-58 common vfork stub_vfork/ptregs
-59 64 execve stub_execve/ptregs
+56 common clone sys_clone/ptregs
+57 common fork sys_fork/ptregs
+58 common vfork sys_vfork/ptregs
+59 64 execve sys_execve/ptregs
60 common exit sys_exit
61 common wait4 sys_wait4
62 common kill sys_kill
@@ -328,7 +328,7 @@
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
-322 64 execveat stub_execveat/ptregs
+322 64 execveat sys_execveat/ptregs
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
@@ -337,14 +337,14 @@
# for native 64-bit operation.
#
512 x32 rt_sigaction compat_sys_rt_sigaction
-513 x32 rt_sigreturn stub_x32_rt_sigreturn
+513 x32 rt_sigreturn sys32_x32_rt_sigreturn
514 x32 ioctl compat_sys_ioctl
515 x32 readv compat_sys_readv
516 x32 writev compat_sys_writev
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
-520 x32 execve stub_x32_execve/ptregs
+520 x32 execve compat_sys_execve/ptregs
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
@@ -369,4 +369,4 @@
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
-545 x32 execveat stub_x32_execveat/ptregs
+545 x32 execveat compat_sys_execveat/ptregs
--
2.5.0
ret_from_fork is now open-coded and is no longer tangled up with the
syscall code. This isn't so bad -- this adds very little code, and
IMO the result is much easier to understand.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 35 +++++++++++++++++++----------------
1 file changed, 19 insertions(+), 16 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8a6b7ce2beff..81b0944708c5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -363,7 +363,6 @@ END(stub_ptregs_64)
* rdi: prev task we switched from
*/
ENTRY(ret_from_fork)
-
LOCK ; btr $TIF_FORK, TI_flags(%r8)
pushq $0x0002
@@ -371,28 +370,32 @@ ENTRY(ret_from_fork)
call schedule_tail /* rdi: 'prev' task parameter */
- RESTORE_EXTRA_REGS
-
testb $3, CS(%rsp) /* from kernel_thread? */
+ jnz 1f
/*
- * By the time we get here, we have no idea whether our pt_regs,
- * ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the 32-bit compat paths.
- * Use IRET code path to return, since it can safely handle
- * all of the above.
+ * We came from kernel_thread. This code path is quite twisted, and
+ * someone should clean it up.
+ *
+ * copy_thread_tls stashes the function pointer in RBX and the
+ * parameter to be passed in RBP. The called function is permitted
+ * to call do_execve and thereby jump to user mode.
*/
- jnz int_ret_from_sys_call
+ movq RBP(%rsp), %rdi
+ call *RBX(%rsp)
+ movl $0, RAX(%rsp)
/*
- * We came from kernel_thread
- * nb: we depend on RESTORE_EXTRA_REGS above
+ * Fall through as though we're exiting a syscall. This makes a
+ * twisted sort of sense if we just called do_execve.
*/
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_EXTRA_REGS
- jmp int_ret_from_sys_call
+
+1:
+ movq %rsp, %rdi
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ SWAPGS
+ jmp restore_regs_and_iret
END(ret_from_fork)
/*
--
2.5.0
This is more complicated than the 32-bit and compat cases because it
preserves an asm fast path for the case where the callee-saved regs
aren't needed in pt_regs and no entry or exit work needs to be done.
This appears to slow down fastpath syscalls by no more than one cycle
on my Skylake laptop.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/common.c | 26 ++++++++++
arch/x86/entry/entry_64.S | 124 +++++++++++++++-------------------------------
2 files changed, 67 insertions(+), 83 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc1f0be..d45119e770ef 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
prepare_exit_to_usermode(regs);
}
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
+ unsigned long nr = regs->orig_ax;
+
+ local_irq_enable();
+
+ if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+ nr = syscall_trace_enter(regs);
+
+ /*
+ * NB: Native and x32 syscalls are dispatched from the same
+ * table. The only functional difference is the x32 bit in
+ * regs->orig_ax, which changes the behavior of some syscalls.
+ */
+ if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+ regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+ regs->di, regs->si, regs->dx,
+ regs->r10, regs->r8, regs->r9);
+ }
+
+ syscall_return_slowpath(regs);
+}
+#endif
+
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
* Does a 32-bit syscall. Called with IRQs on and does all entry and
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 81b0944708c5..1ab5362f241d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ TRACE_IRQS_OFF
+
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- /*
- * Re-enable interrupts.
- * We use 'rsp_scratch' as a scratch space, hence irq-off block above
- * must execute atomically in the face of possible interrupt-driven
- * task preemption. We must enable interrupts only after we're done
- * with using rsp_scratch:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
@@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz tracesys
+ /*
+ * If we need to do entry work or if we guess we'll need to do
+ * exit work, go straight to the slow path.
+ */
+ testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz entry_SYSCALL64_slow_path
+
entry_SYSCALL_64_fastpath:
+ /*
+ * Easy case: enable interrupts and issue the syscall. If the syscall
+ * needs pt_regs, we'll call a stub that disables interrupts again
+ * and jumps to the slow path.
+ */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max, %rax
#else
@@ -185,93 +191,43 @@ entry_SYSCALL_64_fastpath:
call *sys_call_table_fastpath_64(, %rax, 8)
movq %rax, RAX(%rsp)
1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
- LOCKDEP_SYS_EXIT
- /*
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- DISABLE_INTERRUPTS(CLBR_NONE)
/*
- * We must check ti flags with interrupts (or at least preemption)
- * off because we must *never* return to userspace without
- * processing exit work that is enqueued if we're preempted here.
- * In particular, returning to userspace with any of the one-shot
- * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
- * very bad.
+ * If we get here, then we know that pt_regs is clean for SYSRET64.
+ * If we see that no exit work is required (which we are required
+ * to check with IRQs off), then we can go straight to SYSRET64.
*/
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
+ jnz 1f
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RIP(%rsp), %rcx
- movq EFLAGS(%rsp), %r11
+ LOCKDEP_SYS_EXIT
+ TRACE_IRQS_ON /* user mode is traced as IRQs on */
+ RESTORE_C_REGS
movq RSP(%rsp), %rsp
- /*
- * 64-bit SYSRET restores rip from rcx,
- * rflags from r11 (but RF and VM bits are forced to 0),
- * cs and ss are loaded from MSRs.
- * Restoration of rflags re-enables interrupts.
- *
- * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
- * descriptor is not reinitialized. This means that we should
- * avoid SYSRET with SS == NULL, which could happen if we schedule,
- * exit the kernel, and re-enter using an interrupt vector. (All
- * interrupt entries on x86_64 set SS to NULL.) We prevent that
- * from happening by reloading SS in __switch_to. (Actually
- * detecting the failure in 64-bit userspace is tricky but can be
- * done.)
- */
USERGS_SYSRET64
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+ /*
+ * The fast path looked good when we started, but something changed
+ * along the way and we need to switch to the slow path. Calling
+ * raise(3) will trigger this, for example. IRQs are off.
+ */
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- jmp int_ret_from_sys_call
-
- /* Do syscall entry tracing */
-tracesys:
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- call syscall_trace_enter_phase1
SAVE_EXTRA_REGS
movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- movq %rax, %rdx
- call syscall_trace_enter_phase2
-
- /*
- * Reload registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_entry_phase2() returned
- * the value it wants us to use in the table lookup.
- */
- RESTORE_C_REGS_EXCEPT_RAX
-#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max, %rax
-#else
- andl $__SYSCALL_MASK, %eax
- cmpl $__NR_syscall_max, %eax
-#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10, %rcx /* fixup for C */
- call *sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
- RESTORE_EXTRA_REGS
-1:
- /* Use IRET because user could have changed pt_regs->foo */
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ jmp return_from_SYSCALL_64
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+ /* IRQs are off. */
SAVE_EXTRA_REGS
movq %rsp, %rdi
- call syscall_return_slowpath /* returns with IRQs disabled */
+ call do_syscall_64 /* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
RESTORE_EXTRA_REGS
TRACE_IRQS_IRETQ /* we're about to change IF */
@@ -353,8 +309,10 @@ ENTRY(stub_ptregs_64)
* Syscalls marked as needing ptregs that go through the fast path
* land here. We transfer to the slow path.
*/
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
addq $8, %rsp
- jmp tracesys
+ jmp entry_SYSCALL64_slow_path
END(stub_ptregs_64)
/*
--
2.5.0
I want all of the syscall entries to run with interrupts off so that
I can efficiently run context tracking before enabling interrupts.
This will regress int $0x80 performance on 32-bit kernels by a
couple of cycles. This shouldn't matter much -- int $0x80 is not a
fast path.
This effectively reverts 657c1eea0019 ("x86/entry/32: Fix
entry_INT80_32() to expect interrupts to be on") and fixes the issue
differently.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/common.c | 15 +++------------
arch/x86/entry/entry_32.S | 8 ++++----
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/kernel/traps.c | 2 +-
4 files changed, 9 insertions(+), 18 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d45119e770ef..b8a848f80b2a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -377,14 +377,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
* in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
-#ifdef CONFIG_X86_32
-/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
-__visible
-#else
-/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
-static
-#endif
-__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned int nr = (unsigned int)regs->orig_ax;
@@ -419,14 +412,12 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
syscall_return_slowpath(regs);
}
-#ifdef CONFIG_X86_64
-/* Handles INT80 on 64-bit kernels */
-__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
{
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
-#endif
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6a5fe7ff333a..d3f409b670bf 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -346,13 +346,13 @@ ENTRY(entry_INT80_32)
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
- * User mode is traced as though IRQs are on. Unlike the 64-bit
- * case, INT80 is a trap gate on 32-bit kernels, so interrupts
- * are already on (unless user code is messing around with iopl).
+ * User mode is traced as though IRQs are on, and the interrupt gate
+ * turned them off.
*/
+ TRACE_IRQS_OFF
movl %esp, %eax
- call do_syscall_32_irqs_on
+ call do_int80_syscall_32
.Lsyscall_32_done:
restore_all:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index dd160e4e2ef5..78becafe60d1 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -303,7 +303,7 @@ ENTRY(entry_INT80_compat)
TRACE_IRQS_OFF
movq %rsp, %rdi
- call do_syscall_32_irqs_off
+ call do_int80_syscall_32
.Lsyscall_32_done:
/* Go back to user mode. */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..0ad441f721f5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -868,7 +868,7 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
--
2.5.0
Now that slow-path syscalls always enter C before enabling
interrupts, it's straightforward to do enter_from_user_mode before
enabling interrupts rather than doing it as part of entry tracing.
With this change, we should finally be able to retire
exception_enter.
This will also enable optimizations based on knowing that we never
change context tracking state with interrupts on.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/common.c | 39 ++++++++++++++------------------------
arch/x86/include/asm/thread_info.h | 5 ++++-
2 files changed, 18 insertions(+), 26 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index b8a848f80b2a..016ac47c954b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -37,14 +37,17 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
return (struct thread_info *)(top_of_stack - THREAD_SIZE);
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifndef CONFIG_CONTEXT_TRACKING
+static
+#else
+__visible
+#endif
/* Called on entry from user mode with IRQs off. */
-__visible void enter_from_user_mode(void)
+void enter_from_user_mode(void)
{
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit();
}
-#endif
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
-#ifdef CONFIG_CONTEXT_TRACKING
- /*
- * If TIF_NOHZ is set, we are required to call user_exit() before
- * doing anything that could touch RCU.
- */
- if (work & _TIF_NOHZ) {
- enter_from_user_mode();
- work &= ~_TIF_NOHZ;
- }
-#endif
-
#ifdef CONFIG_SECCOMP
/*
* Do seccomp first -- it should minimize exposure of other
@@ -350,6 +342,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long nr = regs->orig_ax;
+ enter_from_user_mode();
local_irq_enable();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -372,9 +365,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
- * Does a 32-bit syscall. Called with IRQs on and does all entry and
- * exit work and returns with IRQs off. This function is extremely hot
- * in workloads that use it, and it's usually called from
+ * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
+ * all entry and exit work and returns with IRQs off. This function is
+ * extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
@@ -415,6 +408,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
+ enter_from_user_mode();
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
@@ -437,11 +431,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
*/
regs->ip = landing_pad;
- /*
- * Fetch ECX from where the vDSO stashed it.
- *
- * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
- */
+ enter_from_user_mode();
+
+ /* Fetch ECX from where the vDSO stashed it. */
local_irq_enable();
if (
#ifdef CONFIG_X86_64
@@ -460,9 +452,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
/* User code screwed up. */
local_irq_disable();
regs->ax = -EFAULT;
-#ifdef CONFIG_CONTEXT_TRACKING
- enter_from_user_mode();
-#endif
prepare_exit_to_usermode(regs);
return 0; /* Keep it simple: use IRET. */
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1ecd214d227..ae210d6159d3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -136,7 +136,10 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
-/* work to do in syscall_trace_enter() */
+/*
+ * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
_TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
--
2.5.0
On Mon, Dec 7, 2015 at 1:51 PM, Andy Lutomirski <[email protected]> wrote:
> This is kind of like the 32-bit and compat code, except that I
> preserved the fast path this time. I was unable to measure any
> significant performance change on my laptop in the fast path.
>
> What do you all think?
For completeness, if I zap the fast path entirely (see attached), I
lose 20 cycles (148 cycles vs 128 cycles) on Skylake. Switching
between movq and pushq for stack setup makes no difference whatsoever,
interestingly. I haven't tried to figure out exactly where those 20
cycles go.
--Andy
On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
> 64-bit syscalls currently have an optimization in which they are
> called with partial pt_regs. A small handful require full pt_regs.
>
> In the 32-bit and compat cases, I cleaned this up by forcing full
> pt_regs for all syscalls. The performance hit doesn't really matter.
>
> I want to clean up the 64-bit case as well, but I don't want to hurt
> fast path performance. To do that, I want to force the syscalls
> that use pt_regs onto the slow path. This will enable us to make
> slow path syscalls be real ABI-compliant C functions.
>
> Use the new syscall entry qualification machinery for this.
> stub_clone is now stub_clone/ptregs.
>
> The next patch will eliminate the stubs, and we'll just have
> sys_clone/ptregs.
I've got an idea on how to do this without the duplicate syscall table.
ptregs_foo:
leaq sys_foo(%rip), %rax
jmp stub_ptregs_64
stub_ptregs_64:
testl $TS_EXTRAREGS, <current->ti_status>
jnz 1f
SAVE_EXTRA_REGS
call *%rax
RESTORE_EXTRA_REGS
ret
1:
call *%rax
--
Brian Gerst
On Mon, Dec 7, 2015 at 7:50 PM, Brian Gerst <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>> 64-bit syscalls currently have an optimization in which they are
>> called with partial pt_regs. A small handful require full pt_regs.
>>
>> In the 32-bit and compat cases, I cleaned this up by forcing full
>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>
>> I want to clean up the 64-bit case as well, but I don't want to hurt
>> fast path performance. To do that, I want to force the syscalls
>> that use pt_regs onto the slow path. This will enable us to make
>> slow path syscalls be real ABI-compliant C functions.
>>
>> Use the new syscall entry qualification machinery for this.
>> stub_clone is now stub_clone/ptregs.
>>
>> The next patch will eliminate the stubs, and we'll just have
>> sys_clone/ptregs.
[Resend after gmail web interface fail]
I've got an idea on how to do this without the duplicate syscall table.
ptregs_foo:
leaq sys_foo(%rip), %rax
jmp stub_ptregs_64
stub_ptregs_64:
testl $TS_EXTRAREGS, <current->ti_status>
jnz 1f
SAVE_EXTRA_REGS
call *%rax
RESTORE_EXTRA_REGS
ret
1:
call *%rax
ret
This makes sure that the extra regs don't get saved a second time if
coming in from the slow path, but preserves the fast path if not
tracing.
--
Brian Gerst
On Mon, Dec 7, 2015 at 4:54 PM, Brian Gerst <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 7:50 PM, Brian Gerst <[email protected]> wrote:
>> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>>> 64-bit syscalls currently have an optimization in which they are
>>> called with partial pt_regs. A small handful require full pt_regs.
>>>
>>> In the 32-bit and compat cases, I cleaned this up by forcing full
>>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>>
>>> I want to clean up the 64-bit case as well, but I don't want to hurt
>>> fast path performance. To do that, I want to force the syscalls
>>> that use pt_regs onto the slow path. This will enable us to make
>>> slow path syscalls be real ABI-compliant C functions.
>>>
>>> Use the new syscall entry qualification machinery for this.
>>> stub_clone is now stub_clone/ptregs.
>>>
>>> The next patch will eliminate the stubs, and we'll just have
>>> sys_clone/ptregs.
>
> [Resend after gmail web interface fail]
>
> I've got an idea on how to do this without the duplicate syscall table.
>
> ptregs_foo:
> leaq sys_foo(%rip), %rax
> jmp stub_ptregs_64
>
> stub_ptregs_64:
> testl $TS_EXTRAREGS, <current->ti_status>
> jnz 1f
> SAVE_EXTRA_REGS
> call *%rax
> RESTORE_EXTRA_REGS
> ret
> 1:
> call *%rax
> ret
>
> This makes sure that the extra regs don't get saved a second time if
> coming in from the slow path, but preserves the fast path if not
> tracing.
I think there's value in having the entries in the table be genuine C
ABI-compliant function pointers. In your example, it only barely
works -- you can call them from C only if you have TS_EXTRAREGS set
appropriately -- otherwise you crash and burn. That will break the
rest of the series.
We could adjust it a bit and check whether we're in C land (by
checking rsp for ts) and jump into the slow path if we aren't, but I'm
not sure this is a huge win. It does save some rodata space by
avoiding duplicating the table.
--Andy
* Andy Lutomirski <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 1:51 PM, Andy Lutomirski <[email protected]> wrote:
>
> > This is kind of like the 32-bit and compat code, except that I preserved the
> > fast path this time. I was unable to measure any significant performance
> > change on my laptop in the fast path.
> >
> > What do you all think?
>
> For completeness, if I zap the fast path entirely (see attached), I lose 20
> cycles (148 cycles vs 128 cycles) on Skylake. Switching between movq and pushq
> for stack setup makes no difference whatsoever, interestingly. I haven't tried
> to figure out exactly where those 20 cycles go.
So I asked for this before, and I'll do so again: could you please stick the cycle
granular system call performance test into a 'perf bench' variant so that:
1) More people can run it all on various pieces of hardware and help out quantify
the patches.
2) We can keep an eye on not regressing base system call performance in the
future, with a good in-tree testcase.
Thanks!!
Ingo
On Mon, Dec 7, 2015 at 8:42 PM, Ingo Molnar <[email protected]> wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
>> On Mon, Dec 7, 2015 at 1:51 PM, Andy Lutomirski <[email protected]> wrote:
>>
>> > This is kind of like the 32-bit and compat code, except that I preserved the
>> > fast path this time. I was unable to measure any significant performance
>> > change on my laptop in the fast path.
>> >
>> > What do you all think?
>>
>> For completeness, if I zap the fast path entirely (see attached), I lose 20
>> cycles (148 cycles vs 128 cycles) on Skylake. Switching between movq and pushq
>> for stack setup makes no difference whatsoever, interestingly. I haven't tried
>> to figure out exactly where those 20 cycles go.
>
> So I asked for this before, and I'll do so again: could you please stick the cycle
> granular system call performance test into a 'perf bench' variant so that:
>
> 1) More people can run it all on various pieces of hardware and help out quantify
> the patches.
>
> 2) We can keep an eye on not regressing base system call performance in the
> future, with a good in-tree testcase.
>
Is it okay if it's not particularly shiny or modular? The tool I'm
using is here:
https://git.kernel.org/cgit/linux/kernel/git/luto/misc-tests.git/tree/tight_loop/perf_self_monitor.c
and I can certainly stick it into 'perf bench' pretty easily. Can I
leave making it into a proper library to some future contributor?
It's actually decently fancy. It allocates a perf self-monitoring
instance that counts cycles, and then it takes a bunch of samples and
discards any that flagged a context switch. It does some very
rudimentary statistics on the rest. It's utterly devoid of a fancy
UI, though.
It works very well on native, and it works better than I had expected
under KVM. (KVM traps RDPMC because neither Intel nor AMD has seen
fit to provide any sensible way to virtualize RDPMC without exiting.)
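For illustration only, here is a rough, hypothetical userspace sketch of that
self-monitoring approach (perf_event_open() plus RDPMC through the mmapped
perf page, discarding samples whose seqlock changed). It is not the actual
misc-tests tool, and it assumes RDPMC is permitted for user space and that
counting kernel cycles is allowed on the machine:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
static uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;
	asm volatile ("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}
int main(void)
{
	struct perf_event_attr attr;
	volatile struct perf_event_mmap_page *pc;
	uint64_t best = ~0ULL;
	int fd, i;
	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	/* self-monitoring: this thread (pid 0), any CPU (-1) */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* the first mmapped page exports the RDPMC index and a seqlock */
	pc = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	for (i = 0; i < 100000; i++) {
		uint32_t seq = pc->lock;
		uint64_t start, end, delta;
		asm volatile ("" ::: "memory");
		if (!pc->cap_user_rdpmc || !pc->index)
			continue;	/* counter not live on this CPU right now */
		start = rdpmc(pc->index - 1);
		syscall(__NR_getpid);
		end = rdpmc(pc->index - 1);
		asm volatile ("" ::: "memory");
		if (pc->lock != seq)
			continue;	/* rescheduled mid-sample: discard it */
		delta = (end - start) & ((1ULL << pc->pmc_width) - 1);
		if (delta < best)
			best = delta;
	}
	printf("best-case cycles around getpid(): %llu\n",
	       (unsigned long long)best);
	return 0;
}
Pinning to one CPU (e.g. with taskset) and taking the minimum over many
samples gives a reasonably stable per-syscall cycle figure.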
--Andy
* Andy Lutomirski <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 8:42 PM, Ingo Molnar <[email protected]> wrote:
> >
> > * Andy Lutomirski <[email protected]> wrote:
> >
> >> On Mon, Dec 7, 2015 at 1:51 PM, Andy Lutomirski <[email protected]> wrote:
> >>
> >> > This is kind of like the 32-bit and compat code, except that I preserved the
> >> > fast path this time. I was unable to measure any significant performance
> >> > change on my laptop in the fast path.
> >> >
> >> > What do you all think?
> >>
> >> For completeness, if I zap the fast path entirely (see attached), I lose 20
> >> cycles (148 cycles vs 128 cycles) on Skylake. Switching between movq and pushq
> >> for stack setup makes no difference whatsoever, interestingly. I haven't tried
> >> to figure out exactly where those 20 cycles go.
> >
> > So I asked for this before, and I'll do so again: could you please stick the cycle
> > granular system call performance test into a 'perf bench' variant so that:
> >
> > 1) More people can run it all on various pieces of hardware and help out quantify
> > the patches.
> >
> > 2) We can keep an eye on not regressing base system call performance in the
> > future, with a good in-tree testcase.
> >
>
> Is it okay if it's not particularly shiny or modular? [...]
Absolutely!
> [...] The tool I'm using is here:
>
> https://git.kernel.org/cgit/linux/kernel/git/luto/misc-tests.git/tree/tight_loop/perf_self_monitor.c
>
> and I can certainly stick it into 'perf bench' pretty easily. Can I
> leave making it into a proper library to some future contributor?
Sure - 'perf bench' tests aren't librarized generally - the goal is to make it
easy to create a new measurement.
> It's actually decently fancy. It allocates a perf self-monitoring
> instance that counts cycles, and then it takes a bunch of samples and
> discards any that flagged a context switch. It does some very
> rudimentary statistics on the rest. It's utterly devoid of a fancy
> UI, though.
>
> It works very well on native, and it works better than I had expected
> under KVM. (KVM traps RDPMC because neither Intel nor AMD has seen
> fit to provide any sensible way to virtualize RDPMC without exiting.)
Sounds fantastic to me!
Thanks,
Ingo
On Mon, Dec 07, 2015 at 01:51:26PM -0800, Andy Lutomirski wrote:
> There aren't any yet, but there might be a few some day.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> tools/testing/selftests/x86/Makefile | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
> index 389701f59940..a460fe7c5365 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -8,8 +8,9 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptr
> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
>
> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
> +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
> BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
> -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
> +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
>
> CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
>
> @@ -37,7 +38,7 @@ clean:
> $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
> $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>
> -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
> +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
> $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>
> # x86_64 users should be encouraged to install 32-bit libraries
> --
It doesn't build some of the tests here if I run make in the x86 dir.
This is unrelated but maybe for the future we should add some feature
testing like perf tool does to warn people if stuff is missing on the
system...
$ cd tools/testing/selftests/x86
$ make
gcc -m32 -o single_step_syscall_32 -O2 -g -std=gnu99 -pthread -Wall single_step_syscall.c -lrt -ldl -lm
gcc -m32 -o sysret_ss_attrs_32 -O2 -g -std=gnu99 -pthread -Wall sysret_ss_attrs.c -lrt -ldl -lm
gcc -m32 -o ldt_gdt_32 -O2 -g -std=gnu99 -pthread -Wall ldt_gdt.c -lrt -ldl -lm
gcc -m32 -o syscall_nt_32 -O2 -g -std=gnu99 -pthread -Wall syscall_nt.c -lrt -ldl -lm
gcc -m32 -o ptrace_syscall_32 -O2 -g -std=gnu99 -pthread -Wall ptrace_syscall.c raw_syscall_helper_32.S -lrt -ldl -lm
gcc -m32 -o entry_from_vm86_32 -O2 -g -std=gnu99 -pthread -Wall entry_from_vm86.c -lrt -ldl -lm
gcc -m32 -o syscall_arg_fault_32 -O2 -g -std=gnu99 -pthread -Wall syscall_arg_fault.c -lrt -ldl -lm
gcc -m32 -o sigreturn_32 -O2 -g -std=gnu99 -pthread -Wall sigreturn.c -lrt -ldl -lm
gcc -m32 -o test_syscall_vdso_32 -O2 -g -std=gnu99 -pthread -Wall test_syscall_vdso.c thunks_32.S -lrt -ldl -lm
In file included from ptrace_syscall.c:6:0:
/usr/include/sys/syscall.h:24:24: fatal error: asm/unistd.h: No such file or directory
compilation terminated.
In file included from entry_from_vm86.c:14:0:
/usr/include/sys/syscall.h:24:24: fatal error: asm/unistd.h: No such file or directory
compilation terminated.
...
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
On Mon, Dec 07, 2015 at 01:51:27PM -0800, Andy Lutomirski wrote:
> This checks that ELF binaries are started with an appropriately
> blank register state.
>
> (There's currently a nasty special case in the entry asm to arrange
> for this. I'm planning on removing the special case, and this will
> help make sure I don't break it.)
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> tools/testing/selftests/x86/Makefile | 8 +-
> .../selftests/x86/check_initial_reg_state.c | 108 +++++++++++++++++++++
> 2 files changed, 115 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/x86/check_initial_reg_state.c
>
> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
> index a460fe7c5365..b82409421fa6 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -4,7 +4,7 @@ include ../lib.mk
>
> .PHONY: all all_32 all_64 warn_32bit_failure clean
>
> -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
> +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall check_initial_reg_state
> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
>
> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
> @@ -63,3 +63,9 @@ endif
> sysret_ss_attrs_64: thunks.S
> ptrace_syscall_32: raw_syscall_helper_32.S
> test_syscall_vdso_32: thunks_32.S
> +
> +# check_initial_reg_state is special: it needs a custom entry, and it
> +# needs to be static so that its interpreter doesn't destroy its initial
> +# state.
> +check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
> +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
> diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
> new file mode 100644
> index 000000000000..0cb565f7786d
> --- /dev/null
> +++ b/tools/testing/selftests/x86/check_initial_reg_state.c
> @@ -0,0 +1,108 @@
> +/*
> + * check_initial_reg_state.c - check that execve sets the correct state
> + * Copyright (c) 2014-2015 Andrew Lutomirski
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + */
> +
> +#define _GNU_SOURCE
> +
> +#include <stdio.h>
> +
> +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
> +unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
> +
> +asm (".pushsection .text\n\t"
WARNING: please, no spaces at the start of a line
#82: FILE: tools/testing/selftests/x86/check_initial_reg_state.c:23:
+ ".type real_start, @function\n\t"$
Something trampled over the preceding tabs in that whole asm().
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
On Mon, Dec 7, 2015 at 8:12 PM, Andy Lutomirski <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 4:54 PM, Brian Gerst <[email protected]> wrote:
>> On Mon, Dec 7, 2015 at 7:50 PM, Brian Gerst <[email protected]> wrote:
>>> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>>>> 64-bit syscalls currently have an optimization in which they are
>>>> called with partial pt_regs. A small handful require full pt_regs.
>>>>
>>>> In the 32-bit and compat cases, I cleaned this up by forcing full
>>>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>>>
>>>> I want to clean up the 64-bit case as well, but I don't want to hurt
>>>> fast path performance. To do that, I want to force the syscalls
>>>> that use pt_regs onto the slow path. This will enable us to make
>>>> slow path syscalls be real ABI-compliant C functions.
>>>>
>>>> Use the new syscall entry qualification machinery for this.
>>>> stub_clone is now stub_clone/ptregs.
>>>>
>>>> The next patch will eliminate the stubs, and we'll just have
>>>> sys_clone/ptregs.
>>
>> [Resend after gmail web interface fail]
>>
>> I've got an idea on how to do this without the duplicate syscall table.
>>
>> ptregs_foo:
>> leaq sys_foo(%rip), %rax
>> jmp stub_ptregs_64
>>
>> stub_ptregs_64:
>> testl $TS_EXTRAREGS, <current->ti_status>
>> jnz 1f
>> SAVE_EXTRA_REGS
>> call *%rax
>> RESTORE_EXTRA_REGS
>> ret
>> 1:
>> call *%rax
>> ret
>>
>> This makes sure that the extra regs don't get saved a second time if
>> coming in from the slow path, but preserves the fast path if not
>> tracing.
>
> I think there's value in having the entries in the table be genuine C
> ABI-compliant function pointers. In your example, it only barely
> works -- you can call them from C only if you have TS_EXTRAREGS set
> appropriately -- otherwise you crash and burn. That will break the
> rest of the series.
I'm working on a full patch. It will set the flag (renamed
TS_SLOWPATH) in do_syscall_64(), which is the only place these
functions can get called from C code. Your changes already have it
set up so that the slow path saves these registers before calling any
C code. Where else do you expect them to be called from?
> We could adjust it a bit and check whether we're in C land (by
> checking rsp for ts) and jump into the slow path if we aren't, but I'm
> not sure this is a huge win. It does save some rodata space by
> avoiding duplicating the table.
The syscall table is huge. 545*8 bytes, over a full page.
Duplicating it for just a few different entries is wasteful.
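(As a quick sanity check of those numbers -- a throwaway sketch using the
entry count quoted above and typical x86 page/cache-line sizes:)
#include <stdio.h>
int main(void)
{
	unsigned int entries = 545, ptr_size = 8;	/* figures quoted above */
	unsigned int line = 64, page = 4096;		/* typical x86 sizes */
	unsigned int bytes = entries * ptr_size;
	printf("%u bytes, %.2f pages, %u cache lines\n",
	       bytes, (double)bytes / page, (bytes + line - 1) / line);
	/* prints: 4360 bytes, 1.06 pages, 69 cache lines */
	return 0;
}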
--
Brian Gerst
* Brian Gerst <[email protected]> wrote:
> > We could adjust it a bit and check whether we're in C land (by checking rsp
> > for ts) and jump into the slow path if we aren't, but I'm not sure this is a
> > huge win. It does save some rodata space by avoiding duplicating the table.
>
> The syscall table is huge. 545*8 bytes, over a full page. Duplicating it for
> just a few different entries is wasteful.
Note that what matters more is cache footprint, not pure size: 1K of RAM overhead
for something as fundamental as system calls is trivial cost.
So the questions to ask are along these lines:
- what is the typical locality of access (do syscall numbers cluster in time and
space)
- how frequently would the two tables be accessed (is one accessed less
frequently than the other?)
- subsequently how does the effective cache footprint change with the
duplication?
it might still end up not being worth it - but it's not the RAM cost that is the
main factor IMHO.
Thanks,
Ingo
On Tue, Dec 8, 2015 at 10:56 AM, Ingo Molnar <[email protected]> wrote:
>
> * Brian Gerst <[email protected]> wrote:
>
>> > We could adjust it a bit and check whether we're in C land (by checking rsp
>> > for ts) and jump into the slow path if we aren't, but I'm not sure this is a
>> > huge win. It does save some rodata space by avoiding duplicating the table.
>>
>> The syscall table is huge. 545*8 bytes, over a full page. Duplicating it for
>> just a few different entries is wasteful.
>
> Note that what matters more is cache footprint, not pure size: 1K of RAM overhead
> for something as fundamental as system calls is trivial cost.
>
> So the questions to ask are along these lines:
>
> - what is the typical locality of access (do syscall numbers cluster in time and
> space)
>
I suspect that they do. Web servers will call send over and over, for example.
> - how frequently would the two tables be accessed (is one accessed less
> frequently than the other?)
On setups that don't bail right away, the fast path table gets hit
most of the time. On setups that do bail right away (context tracking
on, for example), we exclusively use the slow path table.
>
> - subsequently how does the effective cache footprint change with the
> duplication?
In the worst case (repeatedly forking, for example, but I doubt we
care about that case), the duplication adds one extra cacheline.
>
> it might still end up not being worth it - but it's not the RAM cost that is the
> main factor IMHO.
Agreed.
One option: borrow the high bit to indicate "needs ptregs". This adds
a branch to both the fast path and the slow path, but it avoids the
cache hit.
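To make that concrete, a hypothetical userspace toy of the tagging idea (none
of these names are real kernel symbols; the toy borrows the low bit, since a
user process doesn't have a conveniently free high bit in code addresses --
the kernel would pick whatever spare bit or encoding actually works):
#include <stdio.h>
#include <stdint.h>
typedef long (*sys_fn)(long, long, long);
#define NEEDS_PTREGS 0x1UL	/* tag bit; assumes function addresses are even */
static long stub_getpid(long a, long b, long c) { return 1001; }	/* ordinary syscall */
static long stub_clone(long a, long b, long c)  { return 1002; }	/* "needs pt_regs" */
static uintptr_t table[2];
static long dispatch(unsigned long nr, long a, long b, long c)
{
	uintptr_t entry = table[nr];
	if (entry & NEEDS_PTREGS) {
		/* the kernel would divert to the slow path / save extra regs here */
		entry &= ~NEEDS_PTREGS;
	}
	return ((sys_fn)entry)(a, b, c);
}
int main(void)
{
	table[0] = (uintptr_t)stub_getpid;			/* untagged: fast path */
	table[1] = (uintptr_t)stub_clone | NEEDS_PTREGS;	/* tagged: needs pt_regs */
	printf("%ld %ld\n", dispatch(0, 1, 2, 3), dispatch(1, 1, 2, 3));
	return 0;
}
One table, one extra test-and-branch in the dispatcher, and no second copy of
the table to pull into cache.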
Brian's approach gets the best of all worlds except that, if I
understand it right, it's a bit fragile.
--Andy
On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
> 64-bit syscalls currently have an optimization in which they are
> called with partial pt_regs. A small handful require full pt_regs.
>
> In the 32-bit and compat cases, I cleaned this up by forcing full
> pt_regs for all syscalls. The performance hit doesn't really matter.
>
> I want to clean up the 64-bit case as well, but I don't want to hurt
> fast path performance. To do that, I want to force the syscalls
> that use pt_regs onto the slow path. This will enable us to make
> slow path syscalls be real ABI-compliant C functions.
>
> Use the new syscall entry qualification machinery for this.
> stub_clone is now stub_clone/ptregs.
>
> The next patch will eliminate the stubs, and we'll just have
> sys_clone/ptregs.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
Fails to boot, bisected to this patch:
[ 32.675319] kernel BUG at kernel/auditsc.c:1504!
[ 32.675325] invalid opcode: 0000 [#65] SMP
[ 32.675328] Modules linked in:
[ 32.675333] CPU: 1 PID: 216 Comm: systemd-cgroups Tainted: G D
4.3.0-rc4+ #7
[ 32.675336] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 32.675339] task: ffff880000075340 ti: ffff880036520000 task.ti:
ffff880036520000
[ 32.675350] RIP: 0010:[<ffffffff8113d9ed>] [<ffffffff8113d9ed>]
__audit_syscall_entry+0xcd/0xf0
[ 32.675353] RSP: 0018:ffff880036523ef0 EFLAGS: 00010202
[ 32.675355] RAX: 000000000000000c RBX: ffff8800797b3000 RCX: 00007ffef8504e88
[ 32.675357] RDX: 000056172f37cfd0 RSI: 0000000000000000 RDI: 000000000000000c
[ 32.675359] RBP: ffff880036523f00 R08: 0000000000000001 R09: ffff880000075340
[ 32.675361] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[ 32.675363] R13: 00000000c000003e R14: 0000000000000001 R15: 0000000000001000
[ 32.675380] FS: 00007f02b4ff48c0(0000) GS:ffff88007fc80000(0000)
knlGS:0000000000000000
[ 32.675383] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 32.675385] CR2: 00007f93d47ea0e0 CR3: 0000000036aa9000 CR4: 00000000000006e0
[ 32.675391] Stack:
[ 32.675396] ffff880036523f58 0000000000000000 ffff880036523f10
ffffffff8100321b
[ 32.675401] ffff880036523f48 ffffffff81003ad0 000056172f374040
00007f93d45c9990
[ 32.675404] 0000000000000001 0000000000000001 0000000000001000
000000000000000a
[ 32.675405] Call Trace:
[ 32.675414] [<ffffffff8100321b>] do_audit_syscall_entry+0x4b/0x70
[ 32.675420] [<ffffffff81003ad0>] syscall_trace_enter_phase2+0x110/0x1d0
[ 32.675425] [<ffffffff81761d94>] tracesys+0x3a/0x96
[ 32.675464] Code: 00 00 00 00 e8 a5 e0 fc ff c7 43 04 01 00 00 00
48 89 43 18 48 89 53 20 44 89 63 0c c7 83 94 02 00 00 00 00 00 00 5b
41 5c 5d c3 <0f> 0b 48 c7 43 50 00 00 00 00 48 c7 c2 60 b4 c5 81 48 89
de 4c
[ 32.675469] RIP [<ffffffff8113d9ed>] __audit_syscall_entry+0xcd/0xf0
[ 32.675471] RSP <ffff880036523ef0>
--
Brian Gerst
On Tue, Dec 8, 2015 at 8:43 PM, Brian Gerst <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>> 64-bit syscalls currently have an optimization in which they are
>> called with partial pt_regs. A small handful require full pt_regs.
>>
>> In the 32-bit and compat cases, I cleaned this up by forcing full
>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>
>> I want to clean up the 64-bit case as well, but I don't want to hurt
>> fast path performance. To do that, I want to force the syscalls
>> that use pt_regs onto the slow path. This will enable us to make
>> slow path syscalls be real ABI-compliant C functions.
>>
>> Use the new syscall entry qualification machinery for this.
>> stub_clone is now stub_clone/ptregs.
>>
>> The next patch will eliminate the stubs, and we'll just have
>> sys_clone/ptregs.
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>
> Fails to boot, bisected to this patch:
> [ 32.675319] kernel BUG at kernel/auditsc.c:1504!
> [ 32.675325] invalid opcode: 0000 [#65] SMP
> [ 32.675328] Modules linked in:
> [ 32.675333] CPU: 1 PID: 216 Comm: systemd-cgroups Tainted: G D
> 4.3.0-rc4+ #7
> [ 32.675336] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [ 32.675339] task: ffff880000075340 ti: ffff880036520000 task.ti:
> ffff880036520000
> [ 32.675350] RIP: 0010:[<ffffffff8113d9ed>] [<ffffffff8113d9ed>]
> __audit_syscall_entry+0xcd/0xf0
> [ 32.675353] RSP: 0018:ffff880036523ef0 EFLAGS: 00010202
> [ 32.675355] RAX: 000000000000000c RBX: ffff8800797b3000 RCX: 00007ffef8504e88
> [ 32.675357] RDX: 000056172f37cfd0 RSI: 0000000000000000 RDI: 000000000000000c
> [ 32.675359] RBP: ffff880036523f00 R08: 0000000000000001 R09: ffff880000075340
> [ 32.675361] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> [ 32.675363] R13: 00000000c000003e R14: 0000000000000001 R15: 0000000000001000
> [ 32.675380] FS: 00007f02b4ff48c0(0000) GS:ffff88007fc80000(0000)
> knlGS:0000000000000000
> [ 32.675383] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 32.675385] CR2: 00007f93d47ea0e0 CR3: 0000000036aa9000 CR4: 00000000000006e0
> [ 32.675391] Stack:
> [ 32.675396] ffff880036523f58 0000000000000000 ffff880036523f10
> ffffffff8100321b
> [ 32.675401] ffff880036523f48 ffffffff81003ad0 000056172f374040
> 00007f93d45c9990
> [ 32.675404] 0000000000000001 0000000000000001 0000000000001000
> 000000000000000a
> [ 32.675405] Call Trace:
> [ 32.675414] [<ffffffff8100321b>] do_audit_syscall_entry+0x4b/0x70
> [ 32.675420] [<ffffffff81003ad0>] syscall_trace_enter_phase2+0x110/0x1d0
> [ 32.675425] [<ffffffff81761d94>] tracesys+0x3a/0x96
> [ 32.675464] Code: 00 00 00 00 e8 a5 e0 fc ff c7 43 04 01 00 00 00
> 48 89 43 18 48 89 53 20 44 89 63 0c c7 83 94 02 00 00 00 00 00 00 5b
> 41 5c 5d c3 <0f> 0b 48 c7 43 50 00 00 00 00 48 c7 c2 60 b4 c5 81 48 89
> de 4c
> [ 32.675469] RIP [<ffffffff8113d9ed>] __audit_syscall_entry+0xcd/0xf0
> [ 32.675471] RSP <ffff880036523ef0>
I'm not reproducing this, even with audit manually enabled. Can you
send a .config?
--Andy
On Tue, Dec 8, 2015 at 9:45 PM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Dec 8, 2015 at 8:43 PM, Brian Gerst <[email protected]> wrote:
>> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>>> 64-bit syscalls currently have an optimization in which they are
>>> called with partial pt_regs. A small handful require full pt_regs.
>>>
>>> In the 32-bit and compat cases, I cleaned this up by forcing full
>>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>>
>>> I want to clean up the 64-bit case as well, but I don't want to hurt
>>> fast path performance. To do that, I want to force the syscalls
>>> that use pt_regs onto the slow path. This will enable us to make
>>> slow path syscalls be real ABI-compliant C functions.
>>>
>>> Use the new syscall entry qualification machinery for this.
>>> stub_clone is now stub_clone/ptregs.
>>>
>>> The next patch will eliminate the stubs, and we'll just have
>>> sys_clone/ptregs.
>>>
>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>
>> Fails to boot, bisected to this patch:
>> [ 32.675319] kernel BUG at kernel/auditsc.c:1504!
>> [ 32.675325] invalid opcode: 0000 [#65] SMP
>> [ 32.675328] Modules linked in:
>> [ 32.675333] CPU: 1 PID: 216 Comm: systemd-cgroups Tainted: G D
>> 4.3.0-rc4+ #7
>> [ 32.675336] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>> [ 32.675339] task: ffff880000075340 ti: ffff880036520000 task.ti:
>> ffff880036520000
>> [ 32.675350] RIP: 0010:[<ffffffff8113d9ed>] [<ffffffff8113d9ed>]
>> __audit_syscall_entry+0xcd/0xf0
>> [ 32.675353] RSP: 0018:ffff880036523ef0 EFLAGS: 00010202
>> [ 32.675355] RAX: 000000000000000c RBX: ffff8800797b3000 RCX: 00007ffef8504e88
>> [ 32.675357] RDX: 000056172f37cfd0 RSI: 0000000000000000 RDI: 000000000000000c
>> [ 32.675359] RBP: ffff880036523f00 R08: 0000000000000001 R09: ffff880000075340
>> [ 32.675361] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
>> [ 32.675363] R13: 00000000c000003e R14: 0000000000000001 R15: 0000000000001000
>> [ 32.675380] FS: 00007f02b4ff48c0(0000) GS:ffff88007fc80000(0000)
>> knlGS:0000000000000000
>> [ 32.675383] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> [ 32.675385] CR2: 00007f93d47ea0e0 CR3: 0000000036aa9000 CR4: 00000000000006e0
>> [ 32.675391] Stack:
>> [ 32.675396] ffff880036523f58 0000000000000000 ffff880036523f10
>> ffffffff8100321b
>> [ 32.675401] ffff880036523f48 ffffffff81003ad0 000056172f374040
>> 00007f93d45c9990
>> [ 32.675404] 0000000000000001 0000000000000001 0000000000001000
>> 000000000000000a
>> [ 32.675405] Call Trace:
>> [ 32.675414] [<ffffffff8100321b>] do_audit_syscall_entry+0x4b/0x70
>> [ 32.675420] [<ffffffff81003ad0>] syscall_trace_enter_phase2+0x110/0x1d0
>> [ 32.675425] [<ffffffff81761d94>] tracesys+0x3a/0x96
>> [ 32.675464] Code: 00 00 00 00 e8 a5 e0 fc ff c7 43 04 01 00 00 00
>> 48 89 43 18 48 89 53 20 44 89 63 0c c7 83 94 02 00 00 00 00 00 00 5b
>> 41 5c 5d c3 <0f> 0b 48 c7 43 50 00 00 00 00 48 c7 c2 60 b4 c5 81 48 89
>> de 4c
>> [ 32.675469] RIP [<ffffffff8113d9ed>] __audit_syscall_entry+0xcd/0xf0
>> [ 32.675471] RSP <ffff880036523ef0>
>
> I'm not reproducing this, even with audit manually enabled. Can you
> send a .config?
Never mind, I found the bug by inspection. I'll send a fixed up
series tomorrow.
Can you send the boot failure you got with the full series applied,
though? I think that the bug I found is only triggerable part-way
through the series -- I think I inadvertently fixed it later on.
--Andy
On Wed, Dec 9, 2015 at 1:21 AM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Dec 8, 2015 at 9:45 PM, Andy Lutomirski <[email protected]> wrote:
>> On Tue, Dec 8, 2015 at 8:43 PM, Brian Gerst <[email protected]> wrote:
>>> On Mon, Dec 7, 2015 at 4:51 PM, Andy Lutomirski <[email protected]> wrote:
>>>> 64-bit syscalls currently have an optimization in which they are
>>>> called with partial pt_regs. A small handful require full pt_regs.
>>>>
>>>> In the 32-bit and compat cases, I cleaned this up by forcing full
>>>> pt_regs for all syscalls. The performance hit doesn't really matter.
>>>>
>>>> I want to clean up the 64-bit case as well, but I don't want to hurt
>>>> fast path performance. To do that, I want to force the syscalls
>>>> that use pt_regs onto the slow path. This will enable us to make
>>>> slow path syscalls be real ABI-compliant C functions.
>>>>
>>>> Use the new syscall entry qualification machinery for this.
>>>> stub_clone is now stub_clone/ptregs.
>>>>
>>>> The next patch will eliminate the stubs, and we'll just have
>>>> sys_clone/ptregs.
>>>>
>>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>>
>>> Fails to boot, bisected to this patch:
>>> [ 32.675319] kernel BUG at kernel/auditsc.c:1504!
>>> [ 32.675325] invalid opcode: 0000 [#65] SMP
>>> [ 32.675328] Modules linked in:
>>> [ 32.675333] CPU: 1 PID: 216 Comm: systemd-cgroups Tainted: G D
>>> 4.3.0-rc4+ #7
>>> [ 32.675336] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>>> [ 32.675339] task: ffff880000075340 ti: ffff880036520000 task.ti:
>>> ffff880036520000
>>> [ 32.675350] RIP: 0010:[<ffffffff8113d9ed>] [<ffffffff8113d9ed>]
>>> __audit_syscall_entry+0xcd/0xf0
>>> [ 32.675353] RSP: 0018:ffff880036523ef0 EFLAGS: 00010202
>>> [ 32.675355] RAX: 000000000000000c RBX: ffff8800797b3000 RCX: 00007ffef8504e88
>>> [ 32.675357] RDX: 000056172f37cfd0 RSI: 0000000000000000 RDI: 000000000000000c
>>> [ 32.675359] RBP: ffff880036523f00 R08: 0000000000000001 R09: ffff880000075340
>>> [ 32.675361] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
>>> [ 32.675363] R13: 00000000c000003e R14: 0000000000000001 R15: 0000000000001000
>>> [ 32.675380] FS: 00007f02b4ff48c0(0000) GS:ffff88007fc80000(0000)
>>> knlGS:0000000000000000
>>> [ 32.675383] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>>> [ 32.675385] CR2: 00007f93d47ea0e0 CR3: 0000000036aa9000 CR4: 00000000000006e0
>>> [ 32.675391] Stack:
>>> [ 32.675396] ffff880036523f58 0000000000000000 ffff880036523f10
>>> ffffffff8100321b
>>> [ 32.675401] ffff880036523f48 ffffffff81003ad0 000056172f374040
>>> 00007f93d45c9990
>>> [ 32.675404] 0000000000000001 0000000000000001 0000000000001000
>>> 000000000000000a
>>> [ 32.675405] Call Trace:
>>> [ 32.675414] [<ffffffff8100321b>] do_audit_syscall_entry+0x4b/0x70
>>> [ 32.675420] [<ffffffff81003ad0>] syscall_trace_enter_phase2+0x110/0x1d0
>>> [ 32.675425] [<ffffffff81761d94>] tracesys+0x3a/0x96
>>> [ 32.675464] Code: 00 00 00 00 e8 a5 e0 fc ff c7 43 04 01 00 00 00
>>> 48 89 43 18 48 89 53 20 44 89 63 0c c7 83 94 02 00 00 00 00 00 00 5b
>>> 41 5c 5d c3 <0f> 0b 48 c7 43 50 00 00 00 00 48 c7 c2 60 b4 c5 81 48 89
>>> de 4c
>>> [ 32.675469] RIP [<ffffffff8113d9ed>] __audit_syscall_entry+0xcd/0xf0
>>> [ 32.675471] RSP <ffff880036523ef0>
>>
>> I'm not reproducing this, even with audit manually enabled. Can you
>> send a .config?
>
> Never mind, I found the bug by inspection. I'll send a fixed up
> series tomorrow.
>
> Can you send the boot failure you got with the full series applied,
> though? I think that the bug I found is only triggerable part-way
> through the series -- I think I inadvertently fixed it later on.
I can't reproduce it now. It was a hang, or I just didn't get the
oops displayed on the screen. Could have been something unrelated.
--
Brian Gerst
Instead of using a duplicate syscall table for the fast path, create stubs for
the syscalls that need pt_regs that save the extra registers if a flag for the
slow path is not set.
Signed-off-by: Brian Gerst <[email protected]>
To: Andy Lutomirski <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: the arch/x86 maintainers <[email protected]>
Cc: Linux Kernel Mailing List <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Frédéric Weisbecker <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: Linus Torvalds <[email protected]>
---
Applies on top of Andy's syscall cleanup series.
arch/x86/entry/calling.h | 32 ++++++++++++++++----------------
arch/x86/entry/common.c | 4 ++++
arch/x86/entry/entry_64.S | 36 +++++++++++++++++++++++++++++-------
arch/x86/entry/syscall_64.c | 25 +++++--------------------
arch/x86/include/asm/thread_info.h | 1 +
5 files changed, 55 insertions(+), 43 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index e32206e..7c58bd2 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -129,22 +129,22 @@ For 32-bit we have the following conventions - kernel is built with
SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
.endm
- .macro SAVE_EXTRA_REGS offset=0
- movq %r15, 0*8+\offset(%rsp)
- movq %r14, 1*8+\offset(%rsp)
- movq %r13, 2*8+\offset(%rsp)
- movq %r12, 3*8+\offset(%rsp)
- movq %rbp, 4*8+\offset(%rsp)
- movq %rbx, 5*8+\offset(%rsp)
- .endm
-
- .macro RESTORE_EXTRA_REGS offset=0
- movq 0*8+\offset(%rsp), %r15
- movq 1*8+\offset(%rsp), %r14
- movq 2*8+\offset(%rsp), %r13
- movq 3*8+\offset(%rsp), %r12
- movq 4*8+\offset(%rsp), %rbp
- movq 5*8+\offset(%rsp), %rbx
+ .macro SAVE_EXTRA_REGS offset=0 base=rsp
+ movq %r15, 0*8+\offset(%\base)
+ movq %r14, 1*8+\offset(%\base)
+ movq %r13, 2*8+\offset(%\base)
+ movq %r12, 3*8+\offset(%\base)
+ movq %rbp, 4*8+\offset(%\base)
+ movq %rbx, 5*8+\offset(%\base)
+ .endm
+
+ .macro RESTORE_EXTRA_REGS offset=0 base=rsp
+ movq 0*8+\offset(%\base), %r15
+ movq 1*8+\offset(%\base), %r14
+ movq 2*8+\offset(%\base), %r13
+ movq 3*8+\offset(%\base), %r12
+ movq 4*8+\offset(%\base), %rbp
+ movq 5*8+\offset(%\base), %rbx
.endm
.macro ZERO_EXTRA_REGS
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 016ac47..4381aca 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -342,6 +342,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long nr = regs->orig_ax;
+ ti->status |= TS_SLOWPATH;
+
enter_from_user_mode();
local_irq_enable();
@@ -360,6 +362,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
}
syscall_return_slowpath(regs);
+
+ ti->status &= ~TS_SLOWPATH;
}
#endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1ab5362..5852ec6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -188,7 +188,7 @@ entry_SYSCALL_64_fastpath:
#endif
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10, %rcx
- call *sys_call_table_fastpath_64(, %rax, 8)
+ call *sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
@@ -306,15 +306,37 @@ END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
/*
- * Syscalls marked as needing ptregs that go through the fast path
- * land here. We transfer to the slow path.
+ * Syscalls marked as needing ptregs land here.
+ * If we are on the fast path, we need to save the extra regs.
+ * If we are on the slow path, the extra regs are already saved.
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- addq $8, %rsp
- jmp entry_SYSCALL64_slow_path
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %r10
+ testl $TS_SLOWPATH, ASM_THREAD_INFO(TI_status, %r10, 0)
+ jnz 1f
+ subq $SIZEOF_PTREGS, %r10
+ SAVE_EXTRA_REGS base=r10
+ movq %r10, %rbx
+ call *%rax
+ movq %rbx, %r10
+ RESTORE_EXTRA_REGS base=r10
+ ret
+1:
+ jmp *%rax
END(stub_ptregs_64)
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+ leaq \func(%rip), %rax
+ jmp stub_ptregs_64
+END(ptregs_\func)
+.endm
+
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
+
/*
* A newly forked process directly context switches into this address.
*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 601745c..9dbc5ab 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,11 +6,14 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
+
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -22,21 +25,3 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
-
-#undef __SYSCALL_64
-
-extern long stub_ptregs_64(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
-#define __SYSCALL_64_QUAL_(nr, sym) [nr] = sym,
-#define __SYSCALL_64_QUAL_ptregs(nr, sym) [nr] = stub_ptregs_64,
-
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(nr, sym)
-
-asmlinkage const sys_call_ptr_t sys_call_table_fastpath_64[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/syscalls_64.h>
-};
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ae210d6..358e3a9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -229,6 +229,7 @@ static inline unsigned long current_stack_pointer(void)
* ever touches our thread-synchronous status, so we don't
* have to worry about atomic accesses.
*/
+#define TS_SLOWPATH 0x0001 /* syscall slowpath (64BIT) */
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
--
2.5.0
On Wed, Dec 9, 2015 at 5:02 AM, Brian Gerst <[email protected]> wrote:
> Instead of using a duplicate syscall table for the fast path, create stubs for
> the syscalls that need pt_regs that save the extra registers if a flag for the
> slow path is not set.
>
> Signed-off-by: Brian Gerst <[email protected]>
> To: Andy Lutomirski <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: the arch/x86 maintainers <[email protected]>
> Cc: Linux Kernel Mailing List <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Frédéric Weisbecker <[email protected]>
> Cc: Denys Vlasenko <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> ---
>
> Applies on top of Andy's syscall cleanup series.
A couple questions:
> @@ -306,15 +306,37 @@ END(entry_SYSCALL_64)
>
> ENTRY(stub_ptregs_64)
> /*
> - * Syscalls marked as needing ptregs that go through the fast path
> - * land here. We transfer to the slow path.
> + * Syscalls marked as needing ptregs land here.
> + * If we are on the fast path, we need to save the extra regs.
> + * If we are on the slow path, the extra regs are already saved.
> */
> - DISABLE_INTERRUPTS(CLBR_NONE)
> - TRACE_IRQS_OFF
> - addq $8, %rsp
> - jmp entry_SYSCALL64_slow_path
> + movq PER_CPU_VAR(cpu_current_top_of_stack), %r10
> + testl $TS_SLOWPATH, ASM_THREAD_INFO(TI_status, %r10, 0)
> + jnz 1f
OK (but see below), but why not do:
addq $8, %rsp
jmp entry_SYSCALL64_slow_path
here instead of the stack munging below?
> + subq $SIZEOF_PTREGS, %r10
> + SAVE_EXTRA_REGS base=r10
> + movq %r10, %rbx
> + call *%rax
> + movq %rbx, %r10
> + RESTORE_EXTRA_REGS base=r10
> + ret
> +1:
> + jmp *%rax
> END(stub_ptregs_64)
Also, can we not get away with keying off rip or rsp instead of
ti->status? That should be faster and less magical IMO.
--Andy
On Tue, Dec 8, 2015 at 1:34 AM, Borislav Petkov <[email protected]> wrote:
> On Mon, Dec 07, 2015 at 01:51:26PM -0800, Andy Lutomirski wrote:
>> There aren't any yet, but there might be a few some day.
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> tools/testing/selftests/x86/Makefile | 5 +++--
>> 1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
>> index 389701f59940..a460fe7c5365 100644
>> --- a/tools/testing/selftests/x86/Makefile
>> +++ b/tools/testing/selftests/x86/Makefile
>> @@ -8,8 +8,9 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptr
>> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
>>
>> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
>> +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
>> BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
>> -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
>> +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
>>
>> CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
>>
>> @@ -37,7 +38,7 @@ clean:
>> $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
>> $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>>
>> -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
>> +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
>> $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>>
>> # x86_64 users should be encouraged to install 32-bit libraries
>> --
>
> It doesn't build some of the tests here if I run make in the x86 dir.
> This is unrelated but maybe for the future we should add some feature
> testing like perf tool does to warn people if stuff is missing on the
> system...
>
> $ cd tools/testing/selftests/x86
> $ make
> gcc -m32 -o single_step_syscall_32 -O2 -g -std=gnu99 -pthread -Wall single_step_syscall.c -lrt -ldl -lm
> gcc -m32 -o sysret_ss_attrs_32 -O2 -g -std=gnu99 -pthread -Wall sysret_ss_attrs.c -lrt -ldl -lm
> gcc -m32 -o ldt_gdt_32 -O2 -g -std=gnu99 -pthread -Wall ldt_gdt.c -lrt -ldl -lm
> gcc -m32 -o syscall_nt_32 -O2 -g -std=gnu99 -pthread -Wall syscall_nt.c -lrt -ldl -lm
> gcc -m32 -o ptrace_syscall_32 -O2 -g -std=gnu99 -pthread -Wall ptrace_syscall.c raw_syscall_helper_32.S -lrt -ldl -lm
> gcc -m32 -o entry_from_vm86_32 -O2 -g -std=gnu99 -pthread -Wall entry_from_vm86.c -lrt -ldl -lm
> gcc -m32 -o syscall_arg_fault_32 -O2 -g -std=gnu99 -pthread -Wall syscall_arg_fault.c -lrt -ldl -lm
> gcc -m32 -o sigreturn_32 -O2 -g -std=gnu99 -pthread -Wall sigreturn.c -lrt -ldl -lm
> gcc -m32 -o test_syscall_vdso_32 -O2 -g -std=gnu99 -pthread -Wall test_syscall_vdso.c thunks_32.S -lrt -ldl -lm
> In file included from ptrace_syscall.c:6:0:
> /usr/include/sys/syscall.h:24:24: fatal error: asm/unistd.h: No such file or directory
> compilation terminated.
> In file included from entry_from_vm86.c:14:0:
> /usr/include/sys/syscall.h:24:24: fatal error: asm/unistd.h: No such file or directory
> compilation terminated.
Ick. What are you missing? That's weird.
We actually do have a test in the makefile, but it's obviously
incomplete given the failure you're seeing.
--Andy
On Tue, Dec 8, 2015 at 1:54 AM, Borislav Petkov <[email protected]> wrote:
> On Mon, Dec 07, 2015 at 01:51:27PM -0800, Andy Lutomirski wrote:
>> This checks that ELF binaries are started with an appropriately
>> blank register state.
>>
>> (There's currently a nasty special case in the entry asm to arrange
>> for this. I'm planning on removing the special case, and this will
>> help make sure I don't break it.)
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> tools/testing/selftests/x86/Makefile | 8 +-
>> .../selftests/x86/check_initial_reg_state.c | 108 +++++++++++++++++++++
>> 2 files changed, 115 insertions(+), 1 deletion(-)
>> create mode 100644 tools/testing/selftests/x86/check_initial_reg_state.c
>>
>> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
>> index a460fe7c5365..b82409421fa6 100644
>> --- a/tools/testing/selftests/x86/Makefile
>> +++ b/tools/testing/selftests/x86/Makefile
>> @@ -4,7 +4,7 @@ include ../lib.mk
>>
>> .PHONY: all all_32 all_64 warn_32bit_failure clean
>>
>> -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
>> +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall check_initial_reg_state
>> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
>>
>> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
>> @@ -63,3 +63,9 @@ endif
>> sysret_ss_attrs_64: thunks.S
>> ptrace_syscall_32: raw_syscall_helper_32.S
>> test_syscall_vdso_32: thunks_32.S
>> +
>> +# check_initial_reg_state is special: it needs a custom entry, and it
>> +# needs to be static so that its interpreter doesn't destroy its initial
>> +# state.
>> +check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
>> +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
>> diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
>> new file mode 100644
>> index 000000000000..0cb565f7786d
>> --- /dev/null
>> +++ b/tools/testing/selftests/x86/check_initial_reg_state.c
>> @@ -0,0 +1,108 @@
>> +/*
>> + * check_initial_reg_state.c - check that execve sets the correct state
>> + * Copyright (c) 2014-2015 Andrew Lutomirski
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope it will be useful, but
>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * General Public License for more details.
>> + */
>> +
>> +#define _GNU_SOURCE
>> +
>> +#include <stdio.h>
>> +
>> +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
>> +unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
>> +
>> +asm (".pushsection .text\n\t"
>
> WARNING: please, no spaces at the start of a line
> #82: FILE: tools/testing/selftests/x86/check_initial_reg_state.c:23:
> + ".type real_start, @function\n\t"$
>
> Something trampled over the preceding tabs in that whole asm().
That was intentional -- everything lines up with the top-level "asm(".
checkpatch is confused because it understands "\t " at the front of a
line but not just a space.
That being said, I could easily be convinced to switch to tabs there.
--Andy
On Wed, Dec 09, 2015 at 10:56:36AM -0800, Andy Lutomirski wrote:
> That was intentional -- everything lines up with the top-level "asm(".
> checkpatch is confused because it understands "\t " at the front of a
> line but not just a space.
>
> That being said, I could easily be convinced to switch to tabs there.
I'd say kernel coding style is tabs of 8 chars...
You could do
asm(".pushsection .text\n\t"
".type real_start, @function\n\t"
".global real_start\n\t"
"real_start:\n\t"
#ifdef __x86_64__
"mov %rax, ax\n\t"
"mov %rbx, bx\n\t"
...
and align the opening "asm(" to the first tab...
Bah, whatever, I'm not that finicky to care enough.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
On Mon, Dec 7, 2015 at 2:51 PM, Andy Lutomirski <[email protected]> wrote:
> There aren't any yet, but there might be a few some day.
Andy,
Hmm. I would think get_maintainer script should have included
linux-api as well as my email for this patch.
Anyway, I would like to see a better worded changelog.
Something along the lines
Makefile changes to enable x86_64 tests.
thanks,
-- Shuah
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> tools/testing/selftests/x86/Makefile | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
> index 389701f59940..a460fe7c5365 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -8,8 +8,9 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptr
> TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
>
> TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
> +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
> BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
> -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
> +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
>
> CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
>
> @@ -37,7 +38,7 @@ clean:
> $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
> $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>
> -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
> +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c
> $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
>
> # x86_64 users should be encouraged to install 32-bit libraries
> --
> 2.5.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Wed, Dec 9, 2015 at 11:09 AM, Borislav Petkov <[email protected]> wrote:
> On Wed, Dec 09, 2015 at 10:56:36AM -0800, Andy Lutomirski wrote:
>> That was intentional -- everything lines up with the top-level "asm(".
>> checkpatch is confused because it understands "\t " at the front of a
>> line but not just a space.
>>
>> That being said, I could easily be convinced to switch to tabs there.
>
> I'd say kernel coding style is tabs of 8 chars...
emacs, the One True Reference Of Kernel Style, disagrees with you :)
I'll change it.
--Andy
On Wed, Dec 9, 2015 at 11:11 AM, Shuah Khan <[email protected]> wrote:
> On Mon, Dec 7, 2015 at 2:51 PM, Andy Lutomirski <[email protected]> wrote:
>> There aren't any yet, but there might be a few some day.
>
> Andy,
>
> Hmm. I would think get_maintainer script should have included
> linux-api as well as my email for this patch.
Whoops, my bad.
Although... isn't it about time that selftests got its own list?
>
> Anyway, I would like to see a better worded changelog.
> Something along the lines
>
> Makefile changes to enable x86_64 tests.
I find that confusing. The Makefile already supports x86_64 tests.
What's missing is support for tests that are x86_64 but not x86_32.
--Andy
On Wed, Dec 09, 2015 at 11:20:53AM -0800, Andy Lutomirski wrote:
> emacs, the One True Reference Of Kernel Style, disagrees with you :)
Not me, my friend - the bible: Documentation/CodingStyle
:-P
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
Instead of using a duplicate syscall table for the fast path, create
stubs for the syscalls that need pt_regs that dispatch based on the
call site.
I think that this is very likely to introduce a mis-predicted branch
in all such syscalls. I think that's fine -- all of them are
already very slow.
Heavily based on a patch from Brian Gerst [1].
[1] http://lkml.kernel.org/g/[email protected]
Signed-off-by: Brian Gerst <[email protected]>
Cc: the arch/x86 maintainers <[email protected]>
Cc: Linux Kernel Mailing List <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Frédéric Weisbecker <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: Linus Torvalds <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
Brian, here's a counter-proposal. It's derived from your patch, but it works
differently.
If people like this, I'll send a new version of the whole series that includes
it at the end.
arch/x86/entry/entry_64.S | 49 ++++++++++++++++++++++++++++++++++++++-------
arch/x86/entry/syscall_64.c | 25 +++++------------------
2 files changed, 47 insertions(+), 27 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1ab5362f241d..16779b52419e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -188,7 +188,15 @@ entry_SYSCALL_64_fastpath:
#endif
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10, %rcx
- call *sys_call_table_fastpath_64(, %rax, 8)
+
+ /*
+ * This call instruction is handled specially in stub_ptregs_64.
+ * It might end up jumping to the slow path. If it jumps, rax and
+ * r11 are clobbered.
+ */
+ call *sys_call_table(, %rax, 8)
+.Lentry_SYSCALL_64_after_fastpath_call:
+
movq %rax, RAX(%rsp)
1:
@@ -306,15 +314,42 @@ END(entry_SYSCALL_64)
ENTRY(stub_ptregs_64)
/*
- * Syscalls marked as needing ptregs that go through the fast path
- * land here. We transfer to the slow path.
+ * Syscalls marked as needing ptregs land here.
+ * If we are on the fast path, we need to save the extra regs.
+ * If we are on the slow path, the extra regs are already saved.
+ *
+ * RAX stores a pointer to the C function implementing the syscall.
+ *
+ * We can safely clobber RAX (clobbered by return value regardless)
+ * and R11 (owned by callee and never stores an argument) regardless
+ * of which path we take.
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- addq $8, %rsp
- jmp entry_SYSCALL64_slow_path
+ leaq .Lentry_SYSCALL_64_after_fastpath_call(%rip), %r11
+ cmpq %r11, (%rsp)
+ jne 1f
+
+ /* Called from fast path -- pop return address and jump to slow path */
+ popq %rax
+ jmp entry_SYSCALL64_slow_path /* called from fast path */
+
+1:
+ /* Called from C */
+ jmp *%rax /* called from C */
END(stub_ptregs_64)
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+ leaq \func(%rip), %rax
+ jmp stub_ptregs_64
+END(ptregs_\func)
+.endm
+
+/* Instantiate ptregs_stub for each ptregs-using syscall */
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
+
/*
* A newly forked process directly context switches into this address.
*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 601745c667ce..9dbc5abb6162 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -6,11 +6,14 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
+
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -22,21 +25,3 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
-
-#undef __SYSCALL_64
-
-extern long stub_ptregs_64(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
-#define __SYSCALL_64_QUAL_(nr, sym) [nr] = sym,
-#define __SYSCALL_64_QUAL_ptregs(nr, sym) [nr] = stub_ptregs_64,
-
-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(nr, sym)
-
-asmlinkage const sys_call_ptr_t sys_call_table_fastpath_64[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/syscalls_64.h>
-};
--
2.5.0
On Wed, Dec 9, 2015 at 12:22 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 11:11 AM, Shuah Khan <[email protected]> wrote:
>> On Mon, Dec 7, 2015 at 2:51 PM, Andy Lutomirski <[email protected]> wrote:
>>> There aren't any yet, but there might be a few some day.
>>
>> Andy,
>>
>> Hmm. I would think get_maintainer script should have included
>> linux-api and as well as my email for this patch.
>
> Whoops, my bad.
>
> Although... isn't it about time that selftests got its own list?
>
I probably should do that.
>>
>> Anyway, I would like to see a better worded changelog.
>> Something along the lines
>>
>> Makefile changes to enable x86_64 tests.
>
> I find that confusing. The Makefile already supports x86_64 tests.
> What's missing is support for tests that are x86_64 but not x86_32.
>
Right. Exactly why I asked you to make the change log better.
You could phrase what you just told me and that would help me
understand the change better. Something along the lines:
Change Makefile to add support for x86_64 tests that don't
run on x86_32.
thanks,
-- Shuah
On Wed, Dec 9, 2015 at 1:53 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 5:02 AM, Brian Gerst <[email protected]> wrote:
>> Instead of using a duplicate syscall table for the fast path, create stubs for
>> the syscalls that need pt_regs that save the extra registers if a flag for the
>> slow path is not set.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>> To: Andy Lutomirski <[email protected]>
>> Cc: Andy Lutomirski <[email protected]>
>> Cc: the arch/x86 maintainers <[email protected]>
>> Cc: Linux Kernel Mailing List <[email protected]>
>> Cc: Borislav Petkov <[email protected]>
>> Cc: Frédéric Weisbecker <[email protected]>
>> Cc: Denys Vlasenko <[email protected]>
>> Cc: Linus Torvalds <[email protected]>
>> ---
>>
>> Applies on top of Andy's syscall cleanup series.
>
> A couple questions:
>
>> @@ -306,15 +306,37 @@ END(entry_SYSCALL_64)
>>
>> ENTRY(stub_ptregs_64)
>> /*
>> - * Syscalls marked as needing ptregs that go through the fast path
>> - * land here. We transfer to the slow path.
>> + * Syscalls marked as needing ptregs land here.
>> + * If we are on the fast path, we need to save the extra regs.
>> + * If we are on the slow path, the extra regs are already saved.
>> */
>> - DISABLE_INTERRUPTS(CLBR_NONE)
>> - TRACE_IRQS_OFF
>> - addq $8, %rsp
>> - jmp entry_SYSCALL64_slow_path
>> + movq PER_CPU_VAR(cpu_current_top_of_stack), %r10
>> + testl $TS_SLOWPATH, ASM_THREAD_INFO(TI_status, %r10, 0)
>> + jnz 1f
>
> OK (but see below), but why not do:
>
> addq $8, %rsp
> jmp entry_SYSCALL64_slow_path
I've always been averse to doing things like that because it breaks
call/return branch prediction.
Also, are there any side effects to calling enter_from_user_mode()
more than once?
> here instead of the stack munging below?
>
>> + subq $SIZEOF_PTREGS, %r10
>> + SAVE_EXTRA_REGS base=r10
>> + movq %r10, %rbx
>> + call *%rax
>> + movq %rbx, %r10
>> + RESTORE_EXTRA_REGS base=r10
>> + ret
>> +1:
>> + jmp *%rax
>> END(stub_ptregs_64)
After some thought, that can be simplified. It's only executed on the
fast path, so pt_regs is at 8(%rsp).
> Also, can we not get away with keying off rip or rsp instead of
> ti->status? That should be faster and less magical IMO.
Checking if the return address is the instruction after the fast path
dispatch would work.
Simplified version:
ENTRY(stub_ptregs_64)
cmpl $fast_path_return, (%rsp)
jne 1f
SAVE_EXTRA_REGS offset=8
call *%rax
RESTORE_EXTRA_REGS offset=8
ret
1:
jmp *%rax
END(stub_ptregs_64)
--
Brian Gerst
On Wed, Dec 9, 2015 at 1:08 PM, Brian Gerst <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 1:53 PM, Andy Lutomirski <[email protected]> wrote:
>> On Wed, Dec 9, 2015 at 5:02 AM, Brian Gerst <[email protected]> wrote:
>>> Instead of using a duplicate syscall table for the fast path, create stubs for
>>> the syscalls that need pt_regs that save the extra registers if a flag for the
>>> slow path is not set.
>>>
>>> Signed-off-by: Brian Gerst <[email protected]>
>>> To: Andy Lutomirski <[email protected]>
>>> Cc: Andy Lutomirski <[email protected]>
>>> Cc: the arch/x86 maintainers <[email protected]>
>>> Cc: Linux Kernel Mailing List <[email protected]>
>>> Cc: Borislav Petkov <[email protected]>
>>> Cc: Frédéric Weisbecker <[email protected]>
>>> Cc: Denys Vlasenko <[email protected]>
>>> Cc: Linus Torvalds <[email protected]>
>>> ---
>>>
>>> Applies on top of Andy's syscall cleanup series.
>>
>> A couple questions:
>>
>>> @@ -306,15 +306,37 @@ END(entry_SYSCALL_64)
>>>
>>> ENTRY(stub_ptregs_64)
>>> /*
>>> - * Syscalls marked as needing ptregs that go through the fast path
>>> - * land here. We transfer to the slow path.
>>> + * Syscalls marked as needing ptregs land here.
>>> + * If we are on the fast path, we need to save the extra regs.
>>> + * If we are on the slow path, the extra regs are already saved.
>>> */
>>> - DISABLE_INTERRUPTS(CLBR_NONE)
>>> - TRACE_IRQS_OFF
>>> - addq $8, %rsp
>>> - jmp entry_SYSCALL64_slow_path
>>> + movq PER_CPU_VAR(cpu_current_top_of_stack), %r10
>>> + testl $TS_SLOWPATH, ASM_THREAD_INFO(TI_status, %r10, 0)
>>> + jnz 1f
>>
>> OK (but see below), but why not do:
>>
>> addq $8, %rsp
>> jmp entry_SYSCALL64_slow_path
>
> I've always been averse to doing things like that because it breaks
> call/return branch prediction.
I'd agree with you there except that the syscalls in question really
don't matter for performance enough that we should worry about a
handful of cycles from a return misprediction. We're still avoiding
IRET regardless (to the extent possible), and that was always the
major factor.
> Also, are there any side effects to calling enter_from_user_mode()
> more than once?
A warning that invariants are broken if you have an appropriately
configured kernel.
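For context, enter_from_user_mode() is little more than the context-tracking
hand-off, and a second call would trip its state check because we would no
longer be in CONTEXT_USER. Roughly this (a sketch from memory of
arch/x86/entry/common.c, so treat the details as approximate):
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
{
	/* A second call would warn here: we are already in kernel context. */
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit();
}
#endif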
>
>> here instead of the stack munging below?
>>
>>> + subq $SIZEOF_PTREGS, %r10
>>> + SAVE_EXTRA_REGS base=r10
>>> + movq %r10, %rbx
>>> + call *%rax
>>> + movq %rbx, %r10
>>> + RESTORE_EXTRA_REGS base=r10
>>> + ret
>>> +1:
>>> + jmp *%rax
>>> END(stub_ptregs_64)
>
> After some thought, that can be simplified. It's only executed on the
> fast path, so pt_regs is at 8(%rsp).
>
>> Also, can we not get away with keying off rip or rsp instead of
>> ti->status? That should be faster and less magical IMO.
>
> Checking if the return address is the instruction after the fast path
> dispatch would work.
>
> Simplified version:
> ENTRY(stub_ptregs_64)
> cmpl $fast_path_return, (%rsp)
Does that instruction actually work the way you want it to? (Does it
link?) I think you might need to use leaq the way I did in my patch.
> jne 1f
> SAVE_EXTRA_REGS offset=8
> call *%rax
> RESTORE_EXTRA_REGS offset=8
> ret
> 1:
> jmp *%rax
> END(stub_ptregs_64)
This'll work, I think, but I still think I prefer keeping as much
complexity as possible in the slow path. I could be convinced
otherwise, though -- this variant is reasonably clean.
--Andy
On Wed, Dec 9, 2015 at 1:15 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 1:08 PM, Brian Gerst <[email protected]> wrote:
>> Simplified version:
>> ENTRY(stub_ptregs_64)
>> cmpl $fast_path_return, (%rsp)
>
> Does that instruction actually work the way you want it to? (Does it
> link?) I think you might need to use leaq the way I did in my patch.
>
>> jne 1f
>> SAVE_EXTRA_REGS offset=8
>> call *%rax
>> RESTORE_EXTRA_REGS offset=8
>> ret
>> 1:
>> jmp *%rax
>> END(stub_ptregs_64)
>
> This'll work, I think, but I still think I prefer keeping as much
> complexity as possible in the slow path. I could be convinced
> otherwise, though -- this variant is reasonably clean.
On further reflection, there's at least one functional difference.
With my variant, modifying pt_regs from sys_foo/ptregs is safe. In
your variant, it's unsafe unless force_iret() is called. I don't know
whether we care.
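To make that concrete: a ptregs-using syscall that edits the saved user state
only sees those edits reach userspace if the exit path reloads everything from
pt_regs. A hypothetical example (the syscall itself is made up; force_iret()
is the existing helper that flags pending work so the exit takes the slow,
full-restore path):
/* Hypothetical ptregs-using syscall that rewrites the saved user IP. */
asmlinkage long sys_jump_to(unsigned long new_ip)
{
	struct pt_regs *regs = current_pt_regs();
	regs->ip = new_ip;	/* honored only if the exit reloads from pt_regs */
	force_iret();		/* force the slow, full-pt_regs exit path */
	return 0;
}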
--Andy
On Wed, Dec 9, 2015 at 6:50 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 1:15 PM, Andy Lutomirski <[email protected]> wrote:
>> On Wed, Dec 9, 2015 at 1:08 PM, Brian Gerst <[email protected]> wrote:
>>> Simplified version:
>>> ENTRY(stub_ptregs_64)
>>> cmpl $fast_path_return, (%rsp)
>>
>> Does that instruction actually work the way you want it to? (Does it
>> link?) I think you might need to use leaq the way I did in my patch.
It should have been cmpq. leaq isn't necessary, since cmpq's 32-bit
immediate is sign-extended to 64 bits, and kernel text addresses fit in
that range.
>>> jne 1f
>>> SAVE_EXTRA_REGS offset=8
>>> call *%rax
>>> RESTORE_EXTRA_REGS offset=8
>>> ret
>>> 1:
>>> jmp *%rax
>>> END(stub_ptregs_64)
>>
>> This'll work, I think, but I still think I prefer keeping as much
>> complexity as possible in the slow path. I could be convinced
>> otherwise, though -- this variant is reasonably clean.
>
> On further reflection, there's at least one functional difference.
> With my variant, modifying pt_regs from sys_foo/ptregs is safe. In
> your variant, it's unsafe unless force_iret() is called. I don't know
> whether we care.
I can go either way at this point. My main concern was getting rid of
the duplicate table.
--
Brian Gerst
On Wed, Dec 9, 2015 at 9:42 PM, Brian Gerst <[email protected]> wrote:
> On Wed, Dec 9, 2015 at 6:50 PM, Andy Lutomirski <[email protected]> wrote:
>> On Wed, Dec 9, 2015 at 1:15 PM, Andy Lutomirski <[email protected]> wrote:
>>> On Wed, Dec 9, 2015 at 1:08 PM, Brian Gerst <[email protected]> wrote:
>>>> Simplified version:
>>>> ENTRY(stub_ptregs_64)
>>>> cmpl $fast_path_return, (%rsp)
>>>
>>> Does that instruction actually work the way you want it to? (Does it
>>> link?) I think you might need to use leaq the way I did in my patch.
>
> It should have been cmpq. leaq isn't necessary, since cmpq's 32-bit
> immediate is sign-extended to 64 bits, and kernel text addresses fit in
> that range.
Right, I always forget that they're sign-extended and not zero-extended.
I folded that bit into my queue.
>
>>>> jne 1f
>>>> SAVE_EXTRA_REGS offset=8
>>>> call *%rax
>>>> RESTORE_EXTRA_REGS offset=8
>>>> ret
>>>> 1:
>>>> jmp *%rax
>>>> END(stub_ptregs_64)
>>>
>>> This'll work, I think, but I still think I prefer keeping as much
>>> complexity as possible in the slow path. I could be convinced
>>> otherwise, though -- this variant is reasonably clean.
>>
>> On further reflection, there's at least one functional difference.
>> With my variant, modifying pt_regs from sys_foo/ptregs is safe. In
>> your variant, it's unsafe unless force_iret() is called. I don't know
>> whether we care.
>
> I can go either way at this point. My main concern was getting rid of
> the duplicate table.
Agreed. I'll sleep on it, and maybe someone else has some reason to
prefer one approach over the other.
--Andy
--
Andy Lutomirski
AMA Capital Management, LLC
Andy Lutomirski <[email protected]> writes:
> I want all of the syscall entries to run with interrupts off so that
> I can efficiently run context tracking before enabling interrupts.
>
> This will regress int $0x80 performance on 32-bit kernels by a
> couple of cycles. This shouldn't matter much -- int $0x80 is not a
> fast path.
>
> This effectively reverts 657c1eea0019 ("x86/entry/32: Fix
> entry_INT80_32() to expect interrupts to be on") and fixes the issue
> differently.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
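For reference, the functional difference is just the IDT gate type: an
interrupt gate clears IF on delivery, so entry_INT80_32 now starts with
interrupts off. In arch/x86/kernel/traps.c the change amounts to swapping
one registration helper for the other (helper names quoted from memory, so
double-check them against the tree):
	/* Before: DPL-3 trap gate -- interrupts stay enabled on entry. */
	set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
	/* After: DPL-3 interrupt gate -- the CPU clears IF before entering. */
	set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);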
This broke lguest. Wow, diving into that code again was a bit of a
trip.
Thanks!
Rusty.
---
From: Rusty Russell <[email protected]>
Subject: lguest: fix handling of guest syscalls using interrupt gates.
In a798f091113e ("x86/entry/32: Change INT80 to be an interrupt gate")
Andy broke lguest. This is because lguest had special code to allow
the 0x80 trap gate to go straight into the guest itself; interrupt gates
(without more work, as mentioned in the file's comments) bounce via
the hypervisor.
His change made them go via the hypervisor, but since 0x80 is in the range of
normal hardware interrupts, they were not directed through to the guest
at all. Turns out the guest userspace isn't very effective if syscalls
are all no-ops.
I haven't ripped out all the now-useless trap-direct-to-guest-kernel
code yet, since it will still be needed if someone decides to update
this optimization.
Signed-off-by: Rusty Russell <[email protected]>
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index eb934b0242e0..ab1535f032c2 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -331,7 +331,7 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
* Actually now I think of it, it's possible that Ron *is* half the Plan 9
* userbase. Oh well.
*/
-static bool could_be_syscall(unsigned int num)
+bool could_be_syscall(unsigned int num)
{
/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
@@ -416,6 +416,10 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
*
* This routine indicates if a particular trap number could be delivered
* directly.
+ *
+ * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
+ * trap gate for syscalls, so this trick is ineffective. See Mastery for
+ * how we could do this anyway...
*/
static bool direct_trap(unsigned int num)
{
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index ac8ad0461e80..69b3814afd2f 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -167,6 +167,7 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
+bool could_be_syscall(unsigned int num);
int init_interrupts(void);
void free_interrupts(void);
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6a4cd771a2be..adc162c7040d 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -429,8 +429,12 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
return;
break;
case 32 ... 255:
+ /* This might be a syscall. */
+ if (could_be_syscall(cpu->regs->trapnum))
+ break;
+
/*
- * These values mean a real interrupt occurred, in which case
+ * Other values mean a real interrupt occurred, in which case
* the Host handler has already been run. We just do a
* friendly check if another process should now be run, then
* return to run the Guest again.
Commit-ID: f87e0434a3bedeb5e4d75d96d9f3ad424dae6b33
Gitweb: http://git.kernel.org/tip/f87e0434a3bedeb5e4d75d96d9f3ad424dae6b33
Author: Rusty Russell <[email protected]>
AuthorDate: Fri, 1 Apr 2016 12:15:46 +1030
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 1 Apr 2016 08:58:13 +0200
lguest, x86/entry/32: Fix handling of guest syscalls using interrupt gates
In a798f091113e ("x86/entry/32: Change INT80 to be an interrupt gate")
Andy broke lguest. This is because lguest had special code to allow
the 0x80 trap gate to go straight into the guest itself; interrupt gates
(without more work, as mentioned in the file's comments) bounce via
the hypervisor.
His change made them go via the hypervisor, but since 0x80 is in the range of
normal hardware interrupts, they were not directed through to the guest
at all. Turns out the guest userspace isn't very effective if syscalls
are all no-ops.
I haven't ripped out all the now-useless trap-direct-to-guest-kernel
code yet, since it will still be needed if someone decides to update
this optimization.
Signed-off-by: Rusty Russell <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Frédéric Weisbecker <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
drivers/lguest/interrupts_and_traps.c | 6 +++++-
drivers/lguest/lg.h | 1 +
drivers/lguest/x86/core.c | 6 +++++-
3 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index eb934b0..67392b6 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -331,7 +331,7 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
* Actually now I think of it, it's possible that Ron *is* half the Plan 9
* userbase. Oh well.
*/
-static bool could_be_syscall(unsigned int num)
+bool could_be_syscall(unsigned int num)
{
/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
@@ -416,6 +416,10 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
*
* This routine indicates if a particular trap number could be delivered
* directly.
+ *
+ * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
+ * trap gate for syscalls, so this trick is ineffective. See Mastery for
+ * how we could do this anyway...
*/
static bool direct_trap(unsigned int num)
{
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index ac8ad04..69b3814 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -167,6 +167,7 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
bool send_notify_to_eventfd(struct lg_cpu *cpu);
void init_clockdev(struct lg_cpu *cpu);
bool check_syscall_vector(struct lguest *lg);
+bool could_be_syscall(unsigned int num);
int init_interrupts(void);
void free_interrupts(void);
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6a4cd77..adc162c 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -429,8 +429,12 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
return;
break;
case 32 ... 255:
+ /* This might be a syscall. */
+ if (could_be_syscall(cpu->regs->trapnum))
+ break;
+
/*
- * These values mean a real interrupt occurred, in which case
+ * Other values mean a real interrupt occurred, in which case
* the Host handler has already been run. We just do a
* friendly check if another process should now be run, then
* return to run the Guest again.