When we context switch from a shallow call stack to a deeper one, as we
'ret' up the deeper side we may encounter RSB entries (predictions for
where the 'ret' goes to) which were populated in userspace. This is
problematic if we have neither SMEP nor KPTI (the latter of which marks
userspace pages as NX for the kernel), as malicious code in userspace
may then be executed speculatively. So overwrite the CPU's return
prediction stack with calls which are predicted to return to an infinite
loop, to "capture" speculation if this happens. This is required both
for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI.
On Skylake+ the problem is slightly different, and an *underflow* of the
RSB may cause errant branch predictions to occur. So there it's not so
much overwrite, as *filling* the RSB to attempt to prevent it getting
empty. This is only a partial solution for Skylake+ since there are many
other conditions which may result in the RSB becoming empty. The full
solution on Skylake+ is to use IBRS, which will prevent the problem even
when the RSB becomes empty. With IBRS, the RSB-stuffing will not be
required on context switch.
Signed-off-by: David Woodhouse <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
---
arch/x86/entry/entry_32.S | 11 +++++++++++
arch/x86/entry/entry_64.S | 11 +++++++++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/kernel/cpu/bugs.c | 34 ++++++++++++++++++++++++++++++++++
4 files changed, 57 insertions(+)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a1f28a5..ef0e478 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -244,6 +244,17 @@ ENTRY(__switch_to_asm)
movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When we switch from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popl %esi
popl %edi
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 59874bc..b2937d8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -487,6 +487,17 @@ ENTRY(__switch_to_asm)
movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When we switch from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popq %r15
popq %r14
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f275447..aa09559 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -211,6 +211,7 @@
#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e4dc261..c17cce3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -23,6 +23,7 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
+#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
@@ -155,6 +156,22 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_NONE;
}
+/* Check for Skylake-like CPUs (for RSB handling) */
+static bool __init is_skylake_era(void)
+{
+ if (boot_cpu_data.x86 == 6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ return true;
+ }
+ }
+ return false;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -213,6 +230,23 @@ static void __init spectre_v2_select_mitigation(void)
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
+
+ /*
+ * If we don't have SMEP or KPTI, then we run the risk of hitting
+ * userspace addresses in the RSB after a context switch from a
+ * shallow call stack to a deeper one. We must fill the entire
+ * RSB to avoid that, even when using IBRS.
+ *
+ * Skylake era CPUs have a separate issue with *underflow* of the
+ * RSB, when they will predict 'ret' targets from the generic BTB.
+ * IBRS makes that safe, but we need to fill the RSB on context
+ * switch if we're using retpoline.
+ */
+ if ((!boot_cpu_has(X86_FEATURE_PTI) &&
+ !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Filling RSB on context switch\n");
+ }
}
#undef pr_fmt
--
2.7.4
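For readers who have not looked at the FILL_RETURN_BUFFER macro used in the
hunks above: conceptually it emits a short loop of 'call' instructions whose
return addresses all point at a speculation trap, so every RSB entry it
creates predicts a harmless infinite loop. The standalone sketch below is
modelled on the kernel's __FILL_RETURN_BUFFER pattern but is purely
illustrative: the function name, the numeric labels and the hard-coded count
are invented here, and the real macro is emitted inline (in __switch_to_asm
in this patch) and gated at runtime via ALTERNATIVE on the
X86_FEATURE_RSB_CTXSW bit, with RSB_CLEAR_LOOPS (32) as the entry count.

/*
 * Illustrative sketch only, not kernel code.  Each 'call' pushes a
 * return prediction onto the RSB; the address it would "return" to is
 * a pause/lfence loop, so a speculative 'ret' that consumes one of
 * these entries spins harmlessly until speculation is squashed.
 */
        .text
        .globl  rsb_stuff_sketch
        .type   rsb_stuff_sketch, @function
rsb_stuff_sketch:
        mov     $16, %ecx       /* 16 iterations x 2 calls = 32 RSB entries */
1:
        call    2f              /* RSB now predicts a return to 3: */
3:      pause                   /* speculation trap */
        lfence
        jmp     3b
2:
        call    4f              /* second entry, predicting 5: */
5:      pause                   /* speculation trap */
        lfence
        jmp     5b
4:
        dec     %ecx
        jnz     1b
        add     $(8 * 32), %rsp /* drop the 32 return addresses the calls
                                   pushed; nothing ever returns to them */
        ret                     /* mispredicts (RSB top is a trap); harmless */
        .size   rsb_stuff_sketch, .-rsb_stuff_sketch

The same structure is what makes this both an "overwrite" (any stale
userspace entries are replaced by traps) and a "fill" (the RSB is left
holding 32 benign entries), which is why one sequence covers both the
!SMEP/!KPTI case and the Skylake underflow case described above.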
> + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> + pr_info("Filling RSB on context switch\n");
We need to do more things for Skylake (like idle and interrupt fill
and possibly deep call chains), so I don't think it makes sense to

- have an individual flag for each of these. It can be just a single
flag that enables all of this for Skylake.

- print something for each of them. That will just be very noisy
without any useful benefit to the user.
-Andi
On 12/01/18 17:49, David Woodhouse wrote:
> When we context switch from a shallow call stack to a deeper one, as we
> 'ret' up the deeper side we may encounter RSB entries (predictions for
> where the 'ret' goes to) which were populated in userspace. This is
> problematic if we have neither SMEP nor KPTI (the latter of which marks
> userspace pages as NX for the kernel), as malicious code in userspace
> may then be executed speculatively. So overwrite the CPU's return
> prediction stack with calls which are predicted to return to an infinite
> loop, to "capture" speculation if this happens. This is required both
> for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI.
>
> On Skylake+ the problem is slightly different, and an *underflow* of the
> RSB may cause errant branch predictions to occur. So there it's not so
> much overwrite, as *filling* the RSB to attempt to prevent it getting
> empty. This is only a partial solution for Skylake+ since there are many
> other conditions which may result in the RSB becoming empty. The full
> solution on Skylake+ is to use IBRS, which will prevent the problem even
> when the RSB becomes empty. With IBRS, the RSB-stuffing will not be
> required on context switch.
If you unconditionally fill the RSB on every entry to supervisor mode,
then there are never guest-controlled RSB values to be found.
With that property (and IBRS to protect Skylake+), you shouldn't need
RSB filling anywhere in the middle.
~Andrew
On Fri, 2018-01-12 at 10:02 -0800, Andi Kleen wrote:
> > + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> > + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> > + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> > + pr_info("Filling RSB on context switch\n");
>
> We need to do more things for Skylake (like idle and interrupt fill
> and possibly deep call chains), so I don't think it makes sense to
>
> - have an individual flag for each of these. It can be just a single
> flag that enables all of this for Skylake
>
> - print something for each of them. That will just be very noisy
> without any useful benefit to the user.
I still think we are better off using IBRS by default on Skylake.
This patch wasn't really for Skylake; the real use case was for AMD
CPUs (!PTI) without SMEP. Since it happens to be needed on Skylake too we
might as well enable it there... but that doesn't mean I was planning
to do all the other horrible crap we need for Skylake.
On Fri, 2018-01-12 at 18:05 +0000, Andrew Cooper wrote:
>
> If you unconditionally fill the RSB on every entry to supervisor mode,
> then there are never guest-controlled RSB values to be found.
>
> With that property (and IBRS to protect Skylake+), you shouldn't need
> RSB filling anywhere in the middle.
Yes, that's right.
We have a choice — we can do it on kernel entry (in the interrupt and
syscall and NMI paths), and that's nice and easy and really safe
because we know there's *never* a bad RSB entry lurking while we're in
the kernel.
The alternative, which is what we seem to be leaning towards now in
the latest tables from Dave (https://goo.gl/pXbvBE and
https://goo.gl/Grbuhf), is to do it on context switch when we might be
switching from a shallow call stack to a deeper one. Which has much
better performance characteristics for processes which make non-
sleeping syscalls.
The caveat with the latter approach is that we do depend on the fact
that context switches are the only imbalance in the kernel. But that's
OK — we don't have a longjmp or anything else like that. Especially
nothing that goes into a *deeper* call stack. Do we?
On Fri, Jan 12, 2018 at 06:56:18PM +0000, David Woodhouse wrote:
> On Fri, 2018-01-12 at 18:05 +0000, Andrew Cooper wrote:
> >
> > If you unconditionally fill the RSB on every entry to supervisor mode,
> > then there are never guest-controlled RSB values to be found.
> >
> > With that property (and IBRS to protect Skylake+), you shouldn't need
> > RSB filling anywhere in the middle.
>
> Yes, that's right.
>
> We have a choice — we can do it on kernel entry (in the interrupt and
> syscall and NMI paths), and that's nice and easy and really safe
> because we know there's *never* a bad RSB entry lurking while we're in
> the kernel.
>
> The alternative, which is what we seem to be leaning towards now in
> the latest tables from Dave (https://goo.gl/pXbvBE and
> https://goo.gl/Grbuhf), is to do it on context switch when we might be
> switching from a shallow call stack to a deeper one. Which has much
> better performance characteristics for processes which make non-
> sleeping syscalls.
>
> The caveat with the latter approach is that we do depend on the fact
> that context switches are the only imbalance in the kernel. But that's
> OK — we don't have a longjmp or anything else like that. Especially
> nothing that goes into a *deeper* call stack. Do we?
At least some generated code might create RSB imbalances. Function
graph tracing and kretprobes, for example. They mess with the return
path and could probably underflow the RSB pretty easily. I guess they'd
need to be reworked a bit so they only do a single ret.
--
Josh
On Fri, 12 Jan 2018, David Woodhouse wrote:
> +/* Check for Skylake-like CPUs (for RSB handling) */
> +static bool __init is_skylake_era(void)
> +{
> + if (boot_cpu_data.x86 == 6) {
This wants a check for vendor = intel
> + switch (boot_cpu_data.x86_model) {
> + case INTEL_FAM6_SKYLAKE_MOBILE:
> + case INTEL_FAM6_SKYLAKE_DESKTOP:
> + case INTEL_FAM6_SKYLAKE_X:
> + case INTEL_FAM6_KABYLAKE_MOBILE:
> + case INTEL_FAM6_KABYLAKE_DESKTOP:
> + return true;
> + }
> + }
Thanks,
tglx
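Concretely, the vendor check Thomas is asking for is what ends up in the
applied version (see the tip-bot commit below); expressed as a rough delta
against the patch as posted, it amounts to:

 /* Check for Skylake-like CPUs (for RSB handling) */
 static bool __init is_skylake_era(void)
 {
-	if (boot_cpu_data.x86 == 6) {
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_data.x86 == 6) {
 		switch (boot_cpu_data.x86_model) {

Without the vendor test, the family 6 / model number switch could in
principle match a non-Intel part that happens to report family 6, so the
Intel check has to come first.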
Commit-ID: a0ab15c0fb68e202bebd9b17fa49fd7ec48975b3
Gitweb: https://git.kernel.org/tip/a0ab15c0fb68e202bebd9b17fa49fd7ec48975b3
Author: David Woodhouse <[email protected]>
AuthorDate: Fri, 12 Jan 2018 17:49:25 +0000
Committer: Thomas Gleixner <[email protected]>
CommitDate: Sun, 14 Jan 2018 16:41:39 +0100
x86/retpoline: Fill RSB on context switch for affected CPUs
On context switch from a shallow call stack to a deeper one, as the CPU
does 'ret' up the deeper side it may encounter RSB entries (predictions for
where the 'ret' goes to) which were populated in userspace.
This is problematic if neither SMEP nor KPTI (the latter of which marks
userspace pages as NX for the kernel) are active, as malicious code in
userspace may then be executed speculatively.
Overwrite the CPU's return prediction stack with calls which are predicted
to return to an infinite loop, to "capture" speculation if this
happens. This is required both for retpoline, and also in conjunction with
IBRS for !SMEP && !KPTI.
On Skylake+ the problem is slightly different, and an *underflow* of the
RSB may cause errant branch predictions to occur. So there it's not so much
overwrite, as *filling* the RSB to attempt to prevent it getting
empty. This is only a partial solution for Skylake+ since there are many
other conditions which may result in the RSB becoming empty. The full
solution on Skylake+ is to use IBRS, which will prevent the problem even
when the RSB becomes empty. With IBRS, the RSB-stuffing will not be
required on context switch.
[ tglx: Added missing vendor check and slightly massaged comments and
changelog ]
Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Arjan van de Ven <[email protected]>
Cc: [email protected]
Cc: Rik van Riel <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: [email protected]
Cc: Peter Zijlstra <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Jiri Kosina <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Paul Turner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/entry/entry_32.S | 11 +++++++++++
arch/x86/entry/entry_64.S | 11 +++++++++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++
4 files changed, 59 insertions(+)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a1f28a5..60c4c34 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -244,6 +244,17 @@ ENTRY(__switch_to_asm)
movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popl %esi
popl %edi
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 59874bc..d54a0ed 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -487,6 +487,17 @@ ENTRY(__switch_to_asm)
movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
#endif
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
/* restore callee-saved registers */
popq %r15
popq %r14
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f275447..aa09559 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -211,6 +211,7 @@
#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e4dc261..390b3dc 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -23,6 +23,7 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
+#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
@@ -155,6 +156,23 @@ disable:
return SPECTRE_V2_CMD_NONE;
}
+/* Check for Skylake-like CPUs (for RSB handling) */
+static bool __init is_skylake_era(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ return true;
+ }
+ }
+ return false;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -213,6 +231,24 @@ retpoline_auto:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
+
+ /*
+ * If neither SMEP nor KPTI are available, there is a risk of
+ * hitting userspace addresses in the RSB after a context switch
+ * from a shallow call stack to a deeper one. To prevent this fill
+ * the entire RSB, even when using IBRS.
+ *
+ * Skylake era CPUs have a separate issue with *underflow* of the
+ * RSB, when they will predict 'ret' targets from the generic BTB.
+ * The proper mitigation for this is IBRS. If IBRS is not supported
+ * or deactivated in favour of retpolines the RSB fill on context
+ * switch is required.
+ */
+ if ((!boot_cpu_has(X86_FEATURE_PTI) &&
+ !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Filling RSB on context switch\n");
+ }
}
#undef pr_fmt
Commit-ID: c995efd5a740d9cbafbf58bde4973e8b50b4d761
Gitweb: https://git.kernel.org/tip/c995efd5a740d9cbafbf58bde4973e8b50b4d761
Author: David Woodhouse <[email protected]>
AuthorDate: Fri, 12 Jan 2018 17:49:25 +0000
Committer: Thomas Gleixner <[email protected]>
CommitDate: Mon, 15 Jan 2018 00:32:44 +0100
x86/retpoline: Fill RSB on context switch for affected CPUs
> + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> + pr_info("Filling RSB on context switch\n");
> + }
Missing an option to turn this off.
-Andi
On Sun, Jan 14, 2018 at 04:05:54PM -0800, Andi Kleen wrote:
> > + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> > + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> > + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> > + pr_info("Filling RSB on context switch\n");
> > + }
>
> Missing an option to turn this off.
My earlier patch did this properly by folding it
into the big option parser.
https://marc.info/?l=linux-kernel&m=151578282016915&w=2
-Andi
On Sun, 2018-01-14 at 16:05 -0800, Andi Kleen wrote:
> > + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> > + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> > + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> > + pr_info("Filling RSB on context switch\n");
> > + }
>
> Missing an option to turn this off.
Deliberately so. You can already boot with 'spectre_v2=off' to turn off
the mitigations. We are not intending to permit all the bullshit micro-
management of IBRS=3/IBPB=2/RSB=π nonsense.
If you choose retpoline, you get the RSB stuffing which is appropriate
along with that. With IBRS, you get the RSB stuffing which is
appropriate with that. You don't get command line or sysfs tunables to
mess with it. You *do* have the source code, if you really want to make
changes. Don't.
From: David Woodhouse
> Sent: 14 January 2018 17:04
> x86/retpoline: Fill RSB on context switch for affected CPUs
>
> On context switch from a shallow call stack to a deeper one, as the CPU
> does 'ret' up the deeper side it may encounter RSB entries (predictions for
> where the 'ret' goes to) which were populated in userspace.
>
> This is problematic if neither SMEP nor KPTI (the latter of which marks
> userspace pages as NX for the kernel) are active, as malicious code in
> userspace may then be executed speculatively.
...
Do we have a guarantee that all CPUs actually detect the related RSB underflow?
It wouldn't surprise me if at least some CPUs just let it wrap.
This would mean that userspace would see return predictions based
on the values the kernel 'stuffed' into the RSB to fill it.
Potentially this leaks a kernel address to userspace.
David
On Mon, 2018-01-15 at 14:35 +0000, David Laight wrote:
> From: David Woodhouse
> >
> > Sent: 14 January 2018 17:04
> > x86/retpoline: Fill RSB on context switch for affected CPUs
> >
> > On context switch from a shallow call stack to a deeper one, as the CPU
> > does 'ret' up the deeper side it may encounter RSB entries (predictions for
> > where the 'ret' goes to) which were populated in userspace.
> >
> > This is problematic if neither SMEP nor KPTI (the latter of which marks
> > userspace pages as NX for the kernel) are active, as malicious code in
> > userspace may then be executed speculatively.
> ...
>
> Do we have a guarantee that all CPUs actually detect the related RSB underflow?
>
> It wouldn't surprise me if at least some CPUs just let it wrap.
>
> This would mean that userspace would see return predictions based
> on the values the kernel 'stuffed' into the RSB to fill it.
>
> Potentially this leaks a kernel address to userspace.
Yeah, KASLR is dead unless we do a full IBPB before *every* VMLAUNCH or
return to userspace anyway, isn't it? With KPTI we could put the RSB-
stuffer into the syscall trampoline page perhaps...
For this to be a concern for userspace, I think it does have to be true
that only the lower bits are used, which adds a little complexity but
probably isn't insurmountable?
>
> This would mean that userspace would see return predictions based
> on the values the kernel 'stuffed' into the RSB to fill it.
>
> Potentially this leaks a kernel address to userspace.
KASLR pretty much died in May this year to be honest with the KAISER paper (if not before then)
Also, with KPTI the address won't have a TLB mapping so it wouldn't
actually be speculated into.
On Mon, Jan 15, 2018 at 6:42 AM, Arjan van de Ven <[email protected]> wrote:
>>
>> This would mean that userspace would see return predictions based
>> on the values the kernel 'stuffed' into the RSB to fill it.
>>
>> Potentially this leaks a kernel address to userspace.
>
>
> KASLR pretty much died in May this year to be honest with the KAISER paper
> (if not before then)
KASLR was always on shaky ground for local attacks. For pure remote
attacks, it's still useful. And for driving forward research, it
appears to be quite useful. ;)
-Kees
--
Kees Cook
Pixel Security
On 12.01.2018 18:49, Woodhouse, David wrote:
> When we context switch from a shallow call stack to a deeper one, as we
> 'ret' up the deeper side we may encounter RSB entries (predictions for
> where the 'ret' goes to) which were populated in userspace. This is
> problematic if we have neither SMEP nor KPTI (the latter of which marks
> userspace pages as NX for the kernel), as malicious code in userspace
> may then be executed speculatively. So overwrite the CPU's return
> prediction stack with calls which are predicted to return to an infinite
> loop, to "capture" speculation if this happens. This is required both
> for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI.
>
> On Skylake+ the problem is slightly different, and an *underflow* of the
> RSB may cause errant branch predictions to occur. So there it's not so
> much overwrite, as *filling* the RSB to attempt to prevent it getting
> empty. This is only a partial solution for Skylake+ since there are many
> other conditions which may result in the RSB becoming empty. The full
> solution on Skylake+ is to use IBRS, which will prevent the problem even
> when the RSB becomes empty. With IBRS, the RSB-stuffing will not be
> required on context switch.
>
> Signed-off-by: David Woodhouse <[email protected]>
> Acked-by: Arjan van de Ven <[email protected]>
> ---
(..)
> @@ -213,6 +230,23 @@ static void __init spectre_v2_select_mitigation(void)
>
> spectre_v2_enabled = mode;
> pr_info("%s\n", spectre_v2_strings[mode]);
> +
> + /*
> + * If we don't have SMEP or KPTI, then we run the risk of hitting
> + * userspace addresses in the RSB after a context switch from a
> + * shallow call stack to a deeper one. We must fill the entire
> + * RSB to avoid that, even when using IBRS.
> + *
> + * Skylake era CPUs have a separate issue with *underflow* of the
> + * RSB, when they will predict 'ret' targets from the generic BTB.
> + * IBRS makes that safe, but we need to fill the RSB on context
> + * switch if we're using retpoline.
> + */
> + if ((!boot_cpu_has(X86_FEATURE_PTI) &&
> + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
> + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
> + pr_info("Filling RSB on context switch\n");
> + }
Shouldn't the RSB filling on context switch also be done on non-IBPB
CPUs to protect (retpolined) user space tasks from other user space
tasks?
We already issue an IBPB when switching to high-value user space tasks
to protect them from other user space tasks.
Thanks,
Maciej
> Shouldn't the RSB filling on context switch also be done on non-IBPB
> CPUs to protect (retpolined) user space tasks from other user space
> tasks?
The comment is actually incorrect. There's no risk of hitting user space
addresses if we have KPTI and NX (which is fairly universal).
It's mainly needed on Skylake era CPUs.
Should fix the comment. I'll send a patch.
-Andi
On 09.03.2018 16:14, Andi Kleen wrote:
>> Shouldn't the RSB filling on context switch also be done on non-IBPB
>> CPUs to protect (retpolined) user space tasks from other user space
>> tasks?
>
> The comment is actually incorrect. There's no risk of hitting user space
> addresses if we have KPTI and NX (which is fairly universal).
>
> It's mainly needed on Skylake era CPUs.
>
> Should fix the comment. I'll send a patch.
But what about userspace-to-userspace attacks, the ones that IBPB on
context switches currently protects against (at least for high-value
processes, which as currently implemented means non-dumpable ones)?
If I understand the issue correctly, high-value user space processes can
be protected from other user space processes even on CPUs that lack
IBPB as long as they are recompiled with retpolines and there is no
danger of RSB entries from one process being used by another one after
a context switch.
For Skylake this would not be enough, but there we'll (hopefully) have
the IBPB instead.
> -Andi
>
Maciej