2024-05-07 05:30:27

by Josh Poimboeuf

[permalink] [raw]
Subject: [PATCH v5 0/3] x86/bugs: more BHI

Patch 1 fixes some objtool warnings and enables noreturn-related
optimizations for direct-called syscall handlers.

Patches 2 and 3 add 'spectre_bhi=vmexit' which is useful for mitigating
BHI in cloud host environments.

v5:
- dropped syscall hardening patch for now
- dropped "Fix CPU mitigation defaults for !x86" in favor of Sean's fix
- patch 1 fixes (Paul)

Josh Poimboeuf (3):
x86/syscall: Mark exit[_group] syscall handlers __noreturn
x86/bugs: Remove duplicate Spectre cmdline option descriptions
x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

Documentation/admin-guide/hw-vuln/spectre.rst | 84 ++-----------------
.../admin-guide/kernel-parameters.txt | 12 ++-
arch/x86/entry/syscall_32.c | 10 ++-
arch/x86/entry/syscall_64.c | 9 +-
arch/x86/entry/syscall_x32.c | 7 +-
arch/x86/entry/syscalls/syscall_32.tbl | 6 +-
arch/x86/entry/syscalls/syscall_64.tbl | 6 +-
arch/x86/kernel/cpu/bugs.c | 16 ++--
arch/x86/um/sys_call_table_32.c | 10 ++-
arch/x86/um/sys_call_table_64.c | 11 ++-
scripts/syscalltbl.sh | 18 +++-
tools/objtool/noreturns.h | 4 +
12 files changed, 85 insertions(+), 108 deletions(-)

--
2.44.0



2024-05-07 05:30:39

by Josh Poimboeuf

[permalink] [raw]
Subject: [PATCH v5 1/3] x86/syscall: Mark exit[_group] syscall handlers __noreturn

The direct-call syscall dispatch function doesn't know that the exit()
and exit_group() syscall handlers don't return, so the call sites aren't
optimized accordingly.

Fix that by marking those exit syscall declarations __noreturn.

Fixes the following warnings:

vmlinux.o: warning: objtool: x64_sys_call+0x2804: __x64_sys_exit() is missing a __noreturn annotation
vmlinux.o: warning: objtool: ia32_sys_call+0x29b6: __ia32_sys_exit_group() is missing a __noreturn annotation

Fixes: 7390db8aea0d ("x86/bhi: Add support for clearing branch history at syscall entry")
Reported-by: "Paul E. McKenney" <[email protected]>
Closes: https://lkml.kernel.org/lkml/6dba9b32-db2c-4e6d-9500-7a08852f17a3@paulmck-laptop
Tested-by: Paul E. McKenney <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
---
arch/x86/entry/syscall_32.c | 10 ++++++----
arch/x86/entry/syscall_64.c | 9 ++++++---
arch/x86/entry/syscall_x32.c | 7 +++++--
arch/x86/entry/syscalls/syscall_32.tbl | 6 +++---
arch/x86/entry/syscalls/syscall_64.tbl | 6 +++---
arch/x86/um/sys_call_table_32.c | 10 ++++++----
arch/x86/um/sys_call_table_64.c | 11 +++++++----
scripts/syscalltbl.sh | 18 ++++++++++++++++--
tools/objtool/noreturns.h | 4 ++++
9 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index c2235bae17ef..8cc9950d7104 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -14,9 +14,12 @@
#endif

#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
-#undef __SYSCALL
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL

/*
* The sys_call_table[] is no longer used for system calls, but
@@ -28,11 +31,10 @@
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
-#undef __SYSCALL
+#undef __SYSCALL
#endif

#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
-
long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 33b3f09e6f15..ba8354424860 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,8 +8,12 @@
#include <asm/syscall.h>

#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
-#undef __SYSCALL
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL

/*
* The sys_call_table[] is no longer used for system calls, but
@@ -20,10 +24,9 @@
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
-#undef __SYSCALL
+#undef __SYSCALL

#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 03de4a932131..fb77908f44f3 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,11 +8,14 @@
#include <asm/syscall.h>

#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_x32.h>
-#undef __SYSCALL
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL

#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 5f8591ce7f25..9e9a908cd50d 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -2,7 +2,7 @@
# 32-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point> <compat entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
# sys_*() system calls and compat_sys_*() compat system calls if
@@ -12,7 +12,7 @@
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
-1 i386 exit sys_exit
+1 i386 exit sys_exit - noreturn
2 i386 fork sys_fork
3 i386 read sys_read
4 i386 write sys_write
@@ -263,7 +263,7 @@
249 i386 io_cancel sys_io_cancel
250 i386 fadvise64 sys_ia32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
-252 i386 exit_group sys_exit_group
+252 i386 exit_group sys_exit_group - noreturn
253 i386 lookup_dcookie
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f..5ea7387c1aa1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -2,7 +2,7 @@
# 64-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
@@ -68,7 +68,7 @@
57 common fork sys_fork
58 common vfork sys_vfork
59 64 execve sys_execve
-60 common exit sys_exit
+60 common exit sys_exit - noreturn
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
@@ -239,7 +239,7 @@
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
-231 common exit_group sys_exit_group
+231 common exit_group sys_exit_group - noreturn
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 89df5d89d664..51655133eee3 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -9,6 +9,10 @@
#include <linux/cache.h>
#include <asm/syscall.h>

+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -22,15 +26,13 @@
#define sys_vm86 sys_ni_syscall

#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#define __SYSCALL_NORETURN __SYSCALL

#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_32.h>
+#undef __SYSCALL

-#undef __SYSCALL
#define __SYSCALL(nr, sym) sym,
-
-extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index b0b4cfd2308c..943d414f2109 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -9,6 +9,10 @@
#include <linux/cache.h>
#include <asm/syscall.h>

+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -18,14 +22,13 @@
#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall

+#define __SYSCALL_NORETURN __SYSCALL
+
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
+#undef __SYSCALL

-#undef __SYSCALL
#define __SYSCALL(nr, sym) sym,
-
-extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
#include <asm/syscalls_64.h>
};
diff --git a/scripts/syscalltbl.sh b/scripts/syscalltbl.sh
index 6abe143889ef..6a903b87a7c2 100755
--- a/scripts/syscalltbl.sh
+++ b/scripts/syscalltbl.sh
@@ -54,7 +54,7 @@ nxt=0

grep -E "^[0-9]+[[:space:]]+$abis" "$infile" | {

- while read nr abi name native compat ; do
+ while read nr abi name native compat noreturn; do

if [ $nxt -gt $nr ]; then
echo "error: $infile: syscall table is not sorted or duplicates the same syscall number" >&2
@@ -66,7 +66,21 @@ grep -E "^[0-9]+[[:space:]]+$abis" "$infile" | {
nxt=$((nxt + 1))
done

- if [ -n "$compat" ]; then
+ if [ "$compat" = "-" ]; then
+ unset compat
+ fi
+
+ if [ -n "$noreturn" ]; then
+ if [ "$noreturn" != "noreturn" ]; then
+ echo "error: $infile: invalid string \"$noreturn\" in 'noreturn' column"
+ exit 1
+ fi
+ if [ -n "$compat" ]; then
+ echo "__SYSCALL_COMPAT_NORETURN($nr, $native, $compat)"
+ else
+ echo "__SYSCALL_NORETURN($nr, $native)"
+ fi
+ elif [ -n "$compat" ]; then
echo "__SYSCALL_WITH_COMPAT($nr, $native, $compat)"
elif [ -n "$native" ]; then
echo "__SYSCALL($nr, $native)"
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 7ebf29c91184..1e8141ef1b15 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -7,12 +7,16 @@
* Yes, this is unfortunate. A better solution is in the works.
*/
NORETURN(__fortify_panic)
+NORETURN(__ia32_sys_exit)
+NORETURN(__ia32_sys_exit_group)
NORETURN(__kunit_abort)
NORETURN(__module_put_and_kthread_exit)
NORETURN(__reiserfs_panic)
NORETURN(__stack_chk_fail)
NORETURN(__tdx_hypercall_failed)
NORETURN(__ubsan_handle_builtin_unreachable)
+NORETURN(__x64_sys_exit)
+NORETURN(__x64_sys_exit_group)
NORETURN(arch_cpu_idle_dead)
NORETURN(bch2_trans_in_restart_error)
NORETURN(bch2_trans_restart_error)
--
2.44.0


2024-05-07 05:30:48

by Josh Poimboeuf

[permalink] [raw]
Subject: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

In cloud environments it can be useful to *only* enable the vmexit
mitigation and leave syscalls vulnerable. Add that as an option.

This is similar to the old spectre_bhi=auto option which was removed
with the following commit:

36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")

with the main difference being that this has a more descriptive name and
is disabled by default.

Requested-by: Maksim Davydov <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
---
Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++---
arch/x86/kernel/cpu/bugs.c | 16 +++++++++++-----
2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 213d0719e2b7..9c1f63f04502 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6072,9 +6072,15 @@
deployment of the HW BHI control and the SW BHB
clearing sequence.

- on - (default) Enable the HW or SW mitigation
- as needed.
- off - Disable the mitigation.
+ on - (default) Enable the HW or SW mitigation as
+ needed. This protects the kernel from
+ both syscalls and VMs.
+ vmexit - On systems which don't have the HW mitigation
+ available, enable the SW mitigation on vmexit
+ ONLY. On such systems, the host kernel is
+ protected from VM-originated BHI attacks, but
+ may still be vulnerable to syscall attacks.
+ off - Disable the mitigation.

spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ab18185894df..6974c8c9792d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1625,6 +1625,7 @@ static bool __init spec_ctrl_bhi_dis(void)
enum bhi_mitigations {
BHI_MITIGATION_OFF,
BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
};

static enum bhi_mitigations bhi_mitigation __ro_after_init =
@@ -1639,6 +1640,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
bhi_mitigation = BHI_MITIGATION_OFF;
else if (!strcmp(str, "on"))
bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
else
pr_err("Ignoring unknown spectre_bhi option (%s)", str);

@@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
return;
}

+ /* Mitigate in hardware if supported */
if (spec_ctrl_bhi_dis())
return;

if (!IS_ENABLED(CONFIG_X86_64))
return;

- /* Mitigate KVM by default */
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
- pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ return;
+ }

- /* Mitigate syscalls when the mitigation is forced =on */
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and vm exit\n");
setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
- pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
}

static void __init spectre_v2_select_mitigation(void)
--
2.44.0


2024-05-07 05:31:30

by Josh Poimboeuf

[permalink] [raw]
Subject: [PATCH v5 2/3] x86/bugs: Remove duplicate Spectre cmdline option descriptions

Duplicating the documentation of all the Spectre kernel cmdline options
in two separate files is unwieldy and error-prone. Instead just add a
reference to kernel-parameters.txt from spectre.rst.

Signed-off-by: Josh Poimboeuf <[email protected]>
---
Documentation/admin-guide/hw-vuln/spectre.rst | 84 ++-----------------
1 file changed, 9 insertions(+), 75 deletions(-)

diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index 25a04cda4c2c..f9797ab6b38f 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -592,85 +592,19 @@ Spectre variant 2
Mitigation control on the kernel command line
---------------------------------------------

-Spectre variant 2 mitigation can be disabled or force enabled at the
-kernel command line.
+In general the kernel selects reasonable default mitigations for the
+current CPU.
+
+Spectre default mitigations can be disabled or changed at the kernel
+command line with the following options:

nospectre_v1
-
- [X86,PPC] Disable mitigations for Spectre Variant 1
- (bounds check bypass). With this option data leaks are
- possible in the system.
-
nospectre_v2
+ spectre_v2={option}
+ spectre_v2_user={option}
+ spectre_bhi={option}

- [X86] Disable all mitigations for the Spectre variant 2
- (indirect branch prediction) vulnerability. System may
- allow data leaks with this option, which is equivalent
- to spectre_v2=off.
-
-
- spectre_v2=
-
- [X86] Control mitigation of Spectre variant 2
- (indirect branch speculation) vulnerability.
- The default operation protects the kernel from
- user space attacks.
-
- on
- unconditionally enable, implies
- spectre_v2_user=on
- off
- unconditionally disable, implies
- spectre_v2_user=off
- auto
- kernel detects whether your CPU model is
- vulnerable
-
- Selecting 'on' will, and 'auto' may, choose a
- mitigation method at run time according to the
- CPU, the available microcode, the setting of the
- CONFIG_MITIGATION_RETPOLINE configuration option,
- and the compiler with which the kernel was built.
-
- Selecting 'on' will also enable the mitigation
- against user space to user space task attacks.
-
- Selecting 'off' will disable both the kernel and
- the user space protections.
-
- Specific mitigations can also be selected manually:
-
- retpoline auto pick between generic,lfence
- retpoline,generic Retpolines
- retpoline,lfence LFENCE; indirect branch
- retpoline,amd alias for retpoline,lfence
- eibrs Enhanced/Auto IBRS
- eibrs,retpoline Enhanced/Auto IBRS + Retpolines
- eibrs,lfence Enhanced/Auto IBRS + LFENCE
- ibrs use IBRS to protect kernel
-
- Not specifying this option is equivalent to
- spectre_v2=auto.
-
- In general the kernel by default selects
- reasonable mitigations for the current CPU. To
- disable Spectre variant 2 mitigations, boot with
- spectre_v2=off. Spectre variant 1 mitigations
- cannot be disabled.
-
- spectre_bhi=
-
- [X86] Control mitigation of Branch History Injection
- (BHI) vulnerability. This setting affects the deployment
- of the HW BHI control and the SW BHB clearing sequence.
-
- on
- (default) Enable the HW or SW mitigation as
- needed.
- off
- Disable the mitigation.
-
-For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt
+For more details on the available options, refer to Documentation/admin-guide/kernel-parameters.txt

Mitigation selection guide
--------------------------
--
2.44.0


2024-05-07 14:38:40

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v5 1/3] x86/syscall: Mark exit[_group] syscall handlers __noreturn

On Mon, May 06, 2024 at 10:30:04PM -0700, Josh Poimboeuf wrote:
> The direct-call syscall dispatch function doesn't know that the exit()
> and exit_group() syscall handlers don't return, so the call sites aren't
> optimized accordingly.
>
> Fix that by marking those exit syscall declarations __noreturn.
>
> Fixes the following warnings:
>
> vmlinux.o: warning: objtool: x64_sys_call+0x2804: __x64_sys_exit() is missing a __noreturn annotation
> vmlinux.o: warning: objtool: ia32_sys_call+0x29b6: __ia32_sys_exit_group() is missing a __noreturn annotation
>
> Fixes: 7390db8aea0d ("x86/bhi: Add support for clearing branch history at syscall entry")
> Reported-by: "Paul E. McKenney" <[email protected]>
> Closes: https://lkml.kernel.org/lkml/6dba9b32-db2c-4e6d-9500-7a08852f17a3@paulmck-laptop
> Tested-by: Paul E. McKenney <[email protected]>

Just reaffirming my Tested-by, and thank you!

Thanx, Paul

> Signed-off-by: Josh Poimboeuf <[email protected]>
> ---
> arch/x86/entry/syscall_32.c | 10 ++++++----
> arch/x86/entry/syscall_64.c | 9 ++++++---
> arch/x86/entry/syscall_x32.c | 7 +++++--
> arch/x86/entry/syscalls/syscall_32.tbl | 6 +++---
> arch/x86/entry/syscalls/syscall_64.tbl | 6 +++---
> arch/x86/um/sys_call_table_32.c | 10 ++++++----
> arch/x86/um/sys_call_table_64.c | 11 +++++++----
> scripts/syscalltbl.sh | 18 ++++++++++++++++--
> tools/objtool/noreturns.h | 4 ++++
> 9 files changed, 56 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
> index c2235bae17ef..8cc9950d7104 100644
> --- a/arch/x86/entry/syscall_32.c
> +++ b/arch/x86/entry/syscall_32.c
> @@ -14,9 +14,12 @@
> #endif
>
> #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
> -
> +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
> #include <asm/syscalls_32.h>
> -#undef __SYSCALL
> +#undef __SYSCALL
> +
> +#undef __SYSCALL_NORETURN
> +#define __SYSCALL_NORETURN __SYSCALL
>
> /*
> * The sys_call_table[] is no longer used for system calls, but
> @@ -28,11 +31,10 @@
> const sys_call_ptr_t sys_call_table[] = {
> #include <asm/syscalls_32.h>
> };
> -#undef __SYSCALL
> +#undef __SYSCALL
> #endif
>
> #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
> -
> long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
> {
> switch (nr) {
> diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
> index 33b3f09e6f15..ba8354424860 100644
> --- a/arch/x86/entry/syscall_64.c
> +++ b/arch/x86/entry/syscall_64.c
> @@ -8,8 +8,12 @@
> #include <asm/syscall.h>
>
> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
> +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
> #include <asm/syscalls_64.h>
> -#undef __SYSCALL
> +#undef __SYSCALL
> +
> +#undef __SYSCALL_NORETURN
> +#define __SYSCALL_NORETURN __SYSCALL
>
> /*
> * The sys_call_table[] is no longer used for system calls, but
> @@ -20,10 +24,9 @@
> const sys_call_ptr_t sys_call_table[] = {
> #include <asm/syscalls_64.h>
> };
> -#undef __SYSCALL
> +#undef __SYSCALL
>
> #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
> -
> long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
> {
> switch (nr) {
> diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
> index 03de4a932131..fb77908f44f3 100644
> --- a/arch/x86/entry/syscall_x32.c
> +++ b/arch/x86/entry/syscall_x32.c
> @@ -8,11 +8,14 @@
> #include <asm/syscall.h>
>
> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
> +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
> #include <asm/syscalls_x32.h>
> -#undef __SYSCALL
> +#undef __SYSCALL
> +
> +#undef __SYSCALL_NORETURN
> +#define __SYSCALL_NORETURN __SYSCALL
>
> #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
> -
> long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
> {
> switch (nr) {
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 5f8591ce7f25..9e9a908cd50d 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -2,7 +2,7 @@
> # 32-bit system call numbers and entry vectors
> #
> # The format is:
> -# <number> <abi> <name> <entry point> <compat entry point>
> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
> #
> # The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
> # sys_*() system calls and compat_sys_*() compat system calls if
> @@ -12,7 +12,7 @@
> # The abi is always "i386" for this file.
> #
> 0 i386 restart_syscall sys_restart_syscall
> -1 i386 exit sys_exit
> +1 i386 exit sys_exit - noreturn
> 2 i386 fork sys_fork
> 3 i386 read sys_read
> 4 i386 write sys_write
> @@ -263,7 +263,7 @@
> 249 i386 io_cancel sys_io_cancel
> 250 i386 fadvise64 sys_ia32_fadvise64
> # 251 is available for reuse (was briefly sys_set_zone_reclaim)
> -252 i386 exit_group sys_exit_group
> +252 i386 exit_group sys_exit_group - noreturn
> 253 i386 lookup_dcookie
> 254 i386 epoll_create sys_epoll_create
> 255 i386 epoll_ctl sys_epoll_ctl
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 7e8d46f4147f..5ea7387c1aa1 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -2,7 +2,7 @@
> # 64-bit system call numbers and entry vectors
> #
> # The format is:
> -# <number> <abi> <name> <entry point>
> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
> #
> # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
> #
> @@ -68,7 +68,7 @@
> 57 common fork sys_fork
> 58 common vfork sys_vfork
> 59 64 execve sys_execve
> -60 common exit sys_exit
> +60 common exit sys_exit - noreturn
> 61 common wait4 sys_wait4
> 62 common kill sys_kill
> 63 common uname sys_newuname
> @@ -239,7 +239,7 @@
> 228 common clock_gettime sys_clock_gettime
> 229 common clock_getres sys_clock_getres
> 230 common clock_nanosleep sys_clock_nanosleep
> -231 common exit_group sys_exit_group
> +231 common exit_group sys_exit_group - noreturn
> 232 common epoll_wait sys_epoll_wait
> 233 common epoll_ctl sys_epoll_ctl
> 234 common tgkill sys_tgkill
> diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
> index 89df5d89d664..51655133eee3 100644
> --- a/arch/x86/um/sys_call_table_32.c
> +++ b/arch/x86/um/sys_call_table_32.c
> @@ -9,6 +9,10 @@
> #include <linux/cache.h>
> #include <asm/syscall.h>
>
> +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
> + unsigned long, unsigned long,
> + unsigned long, unsigned long);
> +
> /*
> * Below you can see, in terms of #define's, the differences between the x86-64
> * and the UML syscall table.
> @@ -22,15 +26,13 @@
> #define sys_vm86 sys_ni_syscall
>
> #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
> +#define __SYSCALL_NORETURN __SYSCALL
>
> #define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
> #include <asm/syscalls_32.h>
> +#undef __SYSCALL
>
> -#undef __SYSCALL
> #define __SYSCALL(nr, sym) sym,
> -
> -extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
> -
> const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
> #include <asm/syscalls_32.h>
> };
> diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
> index b0b4cfd2308c..943d414f2109 100644
> --- a/arch/x86/um/sys_call_table_64.c
> +++ b/arch/x86/um/sys_call_table_64.c
> @@ -9,6 +9,10 @@
> #include <linux/cache.h>
> #include <asm/syscall.h>
>
> +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
> + unsigned long, unsigned long,
> + unsigned long, unsigned long);
> +
> /*
> * Below you can see, in terms of #define's, the differences between the x86-64
> * and the UML syscall table.
> @@ -18,14 +22,13 @@
> #define sys_iopl sys_ni_syscall
> #define sys_ioperm sys_ni_syscall
>
> +#define __SYSCALL_NORETURN __SYSCALL
> +
> #define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
> #include <asm/syscalls_64.h>
> +#undef __SYSCALL
>
> -#undef __SYSCALL
> #define __SYSCALL(nr, sym) sym,
> -
> -extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
> -
> const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
> #include <asm/syscalls_64.h>
> };
> diff --git a/scripts/syscalltbl.sh b/scripts/syscalltbl.sh
> index 6abe143889ef..6a903b87a7c2 100755
> --- a/scripts/syscalltbl.sh
> +++ b/scripts/syscalltbl.sh
> @@ -54,7 +54,7 @@ nxt=0
>
> grep -E "^[0-9]+[[:space:]]+$abis" "$infile" | {
>
> - while read nr abi name native compat ; do
> + while read nr abi name native compat noreturn; do
>
> if [ $nxt -gt $nr ]; then
> echo "error: $infile: syscall table is not sorted or duplicates the same syscall number" >&2
> @@ -66,7 +66,21 @@ grep -E "^[0-9]+[[:space:]]+$abis" "$infile" | {
> nxt=$((nxt + 1))
> done
>
> - if [ -n "$compat" ]; then
> + if [ "$compat" = "-" ]; then
> + unset compat
> + fi
> +
> + if [ -n "$noreturn" ]; then
> + if [ "$noreturn" != "noreturn" ]; then
> + echo "error: $infile: invalid string \"$noreturn\" in 'noreturn' column"
> + exit 1
> + fi
> + if [ -n "$compat" ]; then
> + echo "__SYSCALL_COMPAT_NORETURN($nr, $native, $compat)"
> + else
> + echo "__SYSCALL_NORETURN($nr, $native)"
> + fi
> + elif [ -n "$compat" ]; then
> echo "__SYSCALL_WITH_COMPAT($nr, $native, $compat)"
> elif [ -n "$native" ]; then
> echo "__SYSCALL($nr, $native)"
> diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
> index 7ebf29c91184..1e8141ef1b15 100644
> --- a/tools/objtool/noreturns.h
> +++ b/tools/objtool/noreturns.h
> @@ -7,12 +7,16 @@
> * Yes, this is unfortunate. A better solution is in the works.
> */
> NORETURN(__fortify_panic)
> +NORETURN(__ia32_sys_exit)
> +NORETURN(__ia32_sys_exit_group)
> NORETURN(__kunit_abort)
> NORETURN(__module_put_and_kthread_exit)
> NORETURN(__reiserfs_panic)
> NORETURN(__stack_chk_fail)
> NORETURN(__tdx_hypercall_failed)
> NORETURN(__ubsan_handle_builtin_unreachable)
> +NORETURN(__x64_sys_exit)
> +NORETURN(__x64_sys_exit_group)
> NORETURN(arch_cpu_idle_dead)
> NORETURN(bch2_trans_in_restart_error)
> NORETURN(bch2_trans_restart_error)
> --
> 2.44.0
>

2024-05-07 18:15:19

by Daniel Sneddon

[permalink] [raw]
Subject: Re: [PATCH v5 2/3] x86/bugs: Remove duplicate Spectre cmdline option descriptions

I love the idea here, but

> nospectre_v2
> + spectre_v2={option}
> + spectre_v2_user={option}
> + spectre_bhi={option}
>

this comes out as just a single line when I run make htmldocs.


2024-05-07 19:20:02

by Daniel Sneddon

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

On 5/6/24 22:30, Josh Poimboeuf wrote:
> In cloud environments it can be useful to *only* enable the vmexit
> mitigation and leave syscalls vulnerable. Add that as an option.
>
> This is similar to the old spectre_bhi=auto option which was removed
> with the following commit:
>
> 36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")
>
> with the main difference being that this has a more descriptive name and
> is disabled by default.
>
> Requested-by: Maksim Davydov <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>
> ---

Does the Kconfig option need to be updated to support this as well? Other than
that,
Reviewed-by: Daniel Sneddon <[email protected]>

> Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++---
> arch/x86/kernel/cpu/bugs.c | 16 +++++++++++-----
> 2 files changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 213d0719e2b7..9c1f63f04502 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6072,9 +6072,15 @@
> deployment of the HW BHI control and the SW BHB
> clearing sequence.
>
> - on - (default) Enable the HW or SW mitigation
> - as needed.
> - off - Disable the mitigation.
> + on - (default) Enable the HW or SW mitigation as
> + needed. This protects the kernel from
> + both syscalls and VMs.
> + vmexit - On systems which don't have the HW mitigation
> + available, enable the SW mitigation on vmexit
> + ONLY. On such systems, the host kernel is
> + protected from VM-originated BHI attacks, but
> + may still be vulnerable to syscall attacks.
> + off - Disable the mitigation.
>
> spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2
> (indirect branch speculation) vulnerability.
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index ab18185894df..6974c8c9792d 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -1625,6 +1625,7 @@ static bool __init spec_ctrl_bhi_dis(void)
> enum bhi_mitigations {
> BHI_MITIGATION_OFF,
> BHI_MITIGATION_ON,
> + BHI_MITIGATION_VMEXIT_ONLY,
> };
>
> static enum bhi_mitigations bhi_mitigation __ro_after_init =
> @@ -1639,6 +1640,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
> bhi_mitigation = BHI_MITIGATION_OFF;
> else if (!strcmp(str, "on"))
> bhi_mitigation = BHI_MITIGATION_ON;
> + else if (!strcmp(str, "vmexit"))
> + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
> else
> pr_err("Ignoring unknown spectre_bhi option (%s)", str);
>
> @@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
> return;
> }
>
> + /* Mitigate in hardware if supported */
> if (spec_ctrl_bhi_dis())
> return;
>
> if (!IS_ENABLED(CONFIG_X86_64))
> return;
>
> - /* Mitigate KVM by default */
> - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
> + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
> + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> + return;
> + }
>
> - /* Mitigate syscalls when the mitigation is forced =on */
> + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and vm exit\n");
> setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> }
>
> static void __init spectre_v2_select_mitigation(void)


2024-05-08 05:20:06

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

On Tue, May 07, 2024 at 07:58:07AM -0700, Daniel Sneddon wrote:
> On 5/6/24 22:30, Josh Poimboeuf wrote:
> > In cloud environments it can be useful to *only* enable the vmexit
> > mitigation and leave syscalls vulnerable. Add that as an option.
> >
> > This is similar to the old spectre_bhi=auto option which was removed
> > with the following commit:
> >
> > 36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")
> >
> > with the main difference being that this has a more descriptive name and
> > is disabled by default.
> >
> > Requested-by: Maksim Davydov <[email protected]>
> > Signed-off-by: Josh Poimboeuf <[email protected]>
> > ---
>
> Does the Kconfig option need to be updated to support this as well?

In general we don't provide a config option for every possible
mitigation cmdline option. If someone requests it we could add it
later.

> Reviewed-by: Daniel Sneddon <[email protected]>

Thanks!

--
Josh

2024-05-08 05:56:15

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH v5 2/3] x86/bugs: Remove duplicate Spectre cmdline option descriptions

On Tue, May 07, 2024 at 08:04:37AM -0700, Daniel Sneddon wrote:
> I love the idea here, but
>
> > nospectre_v2
> > + spectre_v2={option}
> > + spectre_v2_user={option}
> > + spectre_bhi={option}
> >
>
> this comes out as just a single line when I run make htmldocs.

Thanks, the below turns it into a bulleted list:

diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index f9797ab6b38f..132e0bc6007e 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -598,11 +598,11 @@ current CPU.
Spectre default mitigations can be disabled or changed at the kernel
command line with the following options:

- nospectre_v1
- nospectre_v2
- spectre_v2={option}
- spectre_v2_user={option}
- spectre_bhi={option}
+ - nospectre_v1
+ - nospectre_v2
+ - spectre_v2={option}
+ - spectre_v2_user={option}
+ - spectre_bhi={option}

For more details on the available options, refer to Documentation/admin-guide/kernel-parameters.txt


2024-05-08 14:28:50

by Daniel Sneddon

[permalink] [raw]
Subject: Re: [PATCH v5 2/3] x86/bugs: Remove duplicate Spectre cmdline option descriptions

On 5/7/24 22:55, Josh Poimboeuf wrote:
> On Tue, May 07, 2024 at 08:04:37AM -0700, Daniel Sneddon wrote:
>> I love the idea here, but
>>
>>> nospectre_v2
>>> + spectre_v2={option}
>>> + spectre_v2_user={option}
>>> + spectre_bhi={option}
>>>
>>
>> this comes out as just a single line when I run make htmldocs.
>
> Thanks, the below turns it into a bulleted list:
>
> diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
> index f9797ab6b38f..132e0bc6007e 100644
> --- a/Documentation/admin-guide/hw-vuln/spectre.rst
> +++ b/Documentation/admin-guide/hw-vuln/spectre.rst
> @@ -598,11 +598,11 @@ current CPU.
> Spectre default mitigations can be disabled or changed at the kernel
> command line with the following options:
>
> - nospectre_v1
> - nospectre_v2
> - spectre_v2={option}
> - spectre_v2_user={option}
> - spectre_bhi={option}
> + - nospectre_v1
> + - nospectre_v2
> + - spectre_v2={option}
> + - spectre_v2_user={option}
> + - spectre_bhi={option}
>
> For more details on the available options, refer to Documentation/admin-guide/kernel-parameters.txt
>

Looks good.

Reviewed-by: Daniel Sneddon <[email protected]>


2024-05-08 15:10:38

by Nikolay Borisov

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option



On 7.05.24 г. 8:30 ч., Josh Poimboeuf wrote:
> In cloud environments it can be useful to *only* enable the vmexit
> mitigation and leave syscalls vulnerable. Add that as an option.
>
> This is similar to the old spectre_bhi=auto option which was removed
> with the following commit:
>
> 36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")
>
> with the main difference being that this has a more descriptive name and
> is disabled by default.
>
> Requested-by: Maksim Davydov <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>
> ---
> Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++---
> arch/x86/kernel/cpu/bugs.c | 16 +++++++++++-----
> 2 files changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 213d0719e2b7..9c1f63f04502 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6072,9 +6072,15 @@
> deployment of the HW BHI control and the SW BHB
> clearing sequence.
>
> - on - (default) Enable the HW or SW mitigation
> - as needed.
> - off - Disable the mitigation.
> + on - (default) Enable the HW or SW mitigation as
> + needed. This protects the kernel from
> + both syscalls and VMs.
> + vmexit - On systems which don't have the HW mitigation
> + available, enable the SW mitigation on vmexit
> + ONLY. On such systems, the host kernel is
> + protected from VM-originated BHI attacks, but
> + may still be vulnerable to syscall attacks.
> + off - Disable the mitigation.
>
> spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2
> (indirect branch speculation) vulnerability.
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index ab18185894df..6974c8c9792d 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -1625,6 +1625,7 @@ static bool __init spec_ctrl_bhi_dis(void)
> enum bhi_mitigations {
> BHI_MITIGATION_OFF,
> BHI_MITIGATION_ON,
> + BHI_MITIGATION_VMEXIT_ONLY,
> };
>
> static enum bhi_mitigations bhi_mitigation __ro_after_init =
> @@ -1639,6 +1640,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
> bhi_mitigation = BHI_MITIGATION_OFF;
> else if (!strcmp(str, "on"))
> bhi_mitigation = BHI_MITIGATION_ON;
> + else if (!strcmp(str, "vmexit"))
> + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
> else
> pr_err("Ignoring unknown spectre_bhi option (%s)", str);
>
> @@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
> return;
> }
>
> + /* Mitigate in hardware if supported */
> if (spec_ctrl_bhi_dis())
> return;
>
> if (!IS_ENABLED(CONFIG_X86_64))
> return;
>
> - /* Mitigate KVM by default */
> - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
> + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
> + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> + return;
> + }

nit: How about setting CLEAR_BHB_LOOP_ON_VMEXIT unconditionally, then
afterwards checking if MITIGATION_VMEXIT_ONLY is set and if yes simply
return, that way you don't duplicate the setup of the VMEXIT code

>
> - /* Mitigate syscalls when the mitigation is forced =on */
> + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and vm exit\n");
> setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> }
>
> static void __init spectre_v2_select_mitigation(void)

2024-05-09 05:24:55

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

On Wed, May 08, 2024 at 06:10:21PM +0300, Nikolay Borisov wrote:
> > @@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
> > return;
> > }
> > + /* Mitigate in hardware if supported */
> > if (spec_ctrl_bhi_dis())
> > return;
> > if (!IS_ENABLED(CONFIG_X86_64))
> > return;
> > - /* Mitigate KVM by default */
> > - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> > - pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
> > + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
> > + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
> > + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> > + return;
> > + }
>
> nit: How about setting CLEAR_BHB_LOOP_ON_VMEXIT unconditionally, then
> afterwards checking if MITIGATION_VMEXIT_ONLY is set and if yes simply
> return, that way you don't duplicate the setup of the VMEXIT code

I think the duplication actually makes it more readable. In both cases
it puts the setting of the features together along with the
corresponding pr_info().

--
Josh

2024-05-09 08:29:23

by Nikolay Borisov

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option



On 9.05.24 г. 8:24 ч., Josh Poimboeuf wrote:
> On Wed, May 08, 2024 at 06:10:21PM +0300, Nikolay Borisov wrote:
>>> @@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
>>> return;
>>> }
>>> + /* Mitigate in hardware if supported */
>>> if (spec_ctrl_bhi_dis())
>>> return;
>>> if (!IS_ENABLED(CONFIG_X86_64))
>>> return;
>>> - /* Mitigate KVM by default */
>>> - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
>>> - pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
>>> + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
>>> + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
>>> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
>>> + return;
>>> + }
>>
>> nit: How about setting CLEAR_BHB_LOOP_ON_VMEXIT unconditionally, then
>> afterwards checking if MITIGATION_VMEXIT_ONLY is set and if yes simply
>> return, that way you don't duplicate the setup of the VMEXIT code
>
> I think the duplication actually makes it more readable. In both cases
> it puts the setting of the features together along with the
> corresponding pr_info().

Right, my suggestion also meant that the setting + pr_info() will be
together and unconditional, and if MITIGATION_VMEXIT_ONLY is set we return
early, without setting X86_FEATURE_CLEAR_BHB_LOOP. In any case it's a minor
remark, feel free to ignore.

Reviewed-by: Nikolay Borisov <[email protected]>

>

2024-05-20 13:15:01

by Maksim Davydov

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

Hi!
What is the current status of the series?


On 5/7/24 08:30, Josh Poimboeuf wrote:
> In cloud environments it can be useful to *only* enable the vmexit
> mitigation and leave syscalls vulnerable. Add that as an option.
>
> This is similar to the old spectre_bhi=auto option which was removed
> with the following commit:
>
> 36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")
>
> with the main difference being that this has a more descriptive name and
> is disabled by default.
>
> Requested-by: Maksim Davydov <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>
> ---
> Documentation/admin-guide/kernel-parameters.txt | 12 +++++++++---
> arch/x86/kernel/cpu/bugs.c | 16 +++++++++++-----
> 2 files changed, 20 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 213d0719e2b7..9c1f63f04502 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6072,9 +6072,15 @@
> deployment of the HW BHI control and the SW BHB
> clearing sequence.
>
> - on - (default) Enable the HW or SW mitigation
> - as needed.
> - off - Disable the mitigation.
> + on - (default) Enable the HW or SW mitigation as
> + needed. This protects the kernel from
> + both syscalls and VMs.
> + vmexit - On systems which don't have the HW mitigation
> + available, enable the SW mitigation on vmexit
> + ONLY. On such systems, the host kernel is
> + protected from VM-originated BHI attacks, but
> + may still be vulnerable to syscall attacks.
> + off - Disable the mitigation.
>
> spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2
> (indirect branch speculation) vulnerability.
> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index ab18185894df..6974c8c9792d 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c
> @@ -1625,6 +1625,7 @@ static bool __init spec_ctrl_bhi_dis(void)
> enum bhi_mitigations {
> BHI_MITIGATION_OFF,
> BHI_MITIGATION_ON,
> + BHI_MITIGATION_VMEXIT_ONLY,
> };
>
> static enum bhi_mitigations bhi_mitigation __ro_after_init =
> @@ -1639,6 +1640,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
> bhi_mitigation = BHI_MITIGATION_OFF;
> else if (!strcmp(str, "on"))
> bhi_mitigation = BHI_MITIGATION_ON;
> + else if (!strcmp(str, "vmexit"))
> + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
> else
> pr_err("Ignoring unknown spectre_bhi option (%s)", str);
>
> @@ -1659,19 +1662,22 @@ static void __init bhi_select_mitigation(void)
> return;
> }
>
> + /* Mitigate in hardware if supported */
> if (spec_ctrl_bhi_dis())
> return;
>
> if (!IS_ENABLED(CONFIG_X86_64))
> return;
>
> - /* Mitigate KVM by default */
> - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
> + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
> + pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit only\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> + return;
> + }
>
> - /* Mitigate syscalls when the mitigation is forced =on */
> + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and vm exit\n");
> setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
> - pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
> + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
> }
>
> static void __init spectre_v2_select_mitigation(void)

--
Best regards,
Maksim Davydov

2024-05-23 01:04:29

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option

On Mon, May 20, 2024 at 04:12:58PM +0300, Maksim Davydov wrote:
> Hi!
> What is the current status of the series?

Looks like it didn't make the merge window. I can post a new version of
the series next week (with the minor documentation fix in patch 2).

--
Josh

2024-05-27 10:46:37

by Maksim Davydov

[permalink] [raw]
Subject: Re: [PATCH v5 3/3] x86/bugs: Add 'spectre_bhi=vmexit' cmdline option



On 5/8/24 08:19, Josh Poimboeuf wrote:
> On Tue, May 07, 2024 at 07:58:07AM -0700, Daniel Sneddon wrote:
>> On 5/6/24 22:30, Josh Poimboeuf wrote:
>>> In cloud environments it can be useful to *only* enable the vmexit
>>> mitigation and leave syscalls vulnerable. Add that as an option.
>>>
>>> This is similar to the old spectre_bhi=auto option which was removed
>>> with the following commit:
>>>
>>> 36d4fe147c87 ("x86/bugs: Remove CONFIG_BHI_MITIGATION_AUTO and spectre_bhi=auto")
>>>
>>> with the main difference being that this has a more descriptive name and
>>> is disabled by default.
>>>
>>> Requested-by: Maksim Davydov <[email protected]>
>>> Signed-off-by: Josh Poimboeuf <[email protected]>
>>> ---
>>
>> Does the KConfig option need to be updated to support this as well?
>
> In general we don't provide a config option for every possible
> mitigation cmdline option. If someone requests it we could add it
> later.
>
>> Reviewed-by: Daniel Sneddon <[email protected]>
>
> Thanks!
>

I think it will be useful for us to have an appropriate Kconfig option.
Could you please add it to the next version?

--
Best regards,
Maksim Davydov

2024-05-27 11:16:57

by Nikolay Borisov

[permalink] [raw]
Subject: Re: [PATCH v5 1/3] x86/syscall: Mark exit[_group] syscall handlers __noreturn



On 7.05.24 г. 8:30 ч., Josh Poimboeuf wrote:
> The direct-call syscall dispatch function doesn't know that the exit()
> and exit_group() syscall handlers don't return, so the call sites aren't
> optimized accordingly.
>
> Fix that by marking those exit syscall declarations __noreturn.
>
> Fixes the following warnings:
>
> vmlinux.o: warning: objtool: x64_sys_call+0x2804: __x64_sys_exit() is missing a __noreturn annotation
> vmlinux.o: warning: objtool: ia32_sys_call+0x29b6: __ia32_sys_exit_group() is missing a __noreturn annotation
>
> Fixes: 7390db8aea0d ("x86/bhi: Add support for clearing branch history at syscall entry")
> Reported-by: "Paul E. McKenney" <[email protected]>
> Closes: https://lkml.kernel.org/lkml/6dba9b32-db2c-4e6d-9500-7a08852f17a3@paulmck-laptop
> Tested-by: Paul E. McKenney <[email protected]>
> Signed-off-by: Josh Poimboeuf <[email protected]>
> ---
> arch/x86/entry/syscall_32.c | 10 ++++++----
> arch/x86/entry/syscall_64.c | 9 ++++++---
> arch/x86/entry/syscall_x32.c | 7 +++++--
> arch/x86/entry/syscalls/syscall_32.tbl | 6 +++---
> arch/x86/entry/syscalls/syscall_64.tbl | 6 +++---
> arch/x86/um/sys_call_table_32.c | 10 ++++++----
> arch/x86/um/sys_call_table_64.c | 11 +++++++----
> scripts/syscalltbl.sh | 18 ++++++++++++++++--
> tools/objtool/noreturns.h | 4 ++++
> 9 files changed, 56 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
> index c2235bae17ef..8cc9950d7104 100644
> --- a/arch/x86/entry/syscall_32.c
> +++ b/arch/x86/entry/syscall_32.c
> @@ -14,9 +14,12 @@
> #endif
>
> #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
> -
> +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
> #include <asm/syscalls_32.h>
> -#undef __SYSCALL
> +#undef __SYSCALL
> +
> +#undef __SYSCALL_NORETURN
> +#define __SYSCALL_NORETURN __SYSCALL
>
> /*
> * The sys_call_table[] is no longer used for system calls, but
> @@ -28,11 +31,10 @@
> const sys_call_ptr_t sys_call_table[] = {
> #include <asm/syscalls_32.h>
> };
> -#undef __SYSCALL
> +#undef __SYSCALL

nit: Am I blind, or do all the __SYSCALL lines have extra whitespace?

<snip>