2015-04-24 02:15:08

by Andy Lutomirski

[permalink] [raw]
Subject: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
with SS == 0 results in an invalid usermode state in which SS is
apparently equal to __USER_DS but causes #SS if used.

Work around the issue by replacing NULL SS values with __KERNEL_DS
in __switch_to, thus ensuring that SYSRET never happens with SS set
to NULL.

This was exposed by a recent vDSO cleanup.

Fixes: e7d6eefaaa44 ("x86/vdso32/syscall.S: Do not load __USER32_DS to %ss")
Signed-off-by: Andy Lutomirski <[email protected]>
---

Tested only on Intel, which isn't very interesting. I'll tidy up
and send a test case, too, once Borislav confirms that it works.

Please don't actually apply this until we're sure we understand the
scope of the issue. If this doesn't affect SYSRETQ, then we might
want to fix it up before SYSRETL instead, to avoid impacting 64-bit
processes at all.

arch/x86/ia32/ia32entry.S | 5 +++++
arch/x86/include/asm/cpufeature.h | 1 +
arch/x86/kernel/cpu/amd.c | 3 +++
arch/x86/kernel/entry_64.S | 6 ++++++
arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++
5 files changed, 40 insertions(+)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3c9fadf8b95c..ff519ea8b054 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -421,6 +421,11 @@ sysretl_from_sys_call:
* cs and ss are loaded from MSRs.
* (Note: 32bit->32bit SYSRET is different: since r11
* does not exist, it merely sets eflags.IF=1).
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we must
+ * avoid SYSRET with SS == NULL. We prevent that from happening
+ * by reloading SS in __switch_to.
*/
USERGS_SYSRET32

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 854c04b3c9c2..7e244f626301 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -257,6 +257,7 @@
#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd470ebf924e..e4cf63301ff4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET */
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
}

#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0034012d04ea..ffeaed0534d8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,12 @@ system_call_fastpath:
* rflags from r11 (but RF and VM bits are forced to 0),
* cs and ss are loaded from MSRs.
* Restoration of rflags re-enables interrupts.
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we should
+ * avoid SYSRET with SS == NULL. We prevent that from happening
+ * by reloading SS in __switch_to. (Actually detecting the failure
+ * in 64-bit userspace is tricky but can be done.)
*/
USERGS_SYSRET64

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4baaa972f52a..919d6c2abded 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -419,6 +419,31 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);

+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+ /*
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector
+ * but does not update the cached descriptor. As a result,
+ * if we do SYSRET while SS is NULL, we'll end up in user
+ * mode with SS apparently equal to __USER_DS but actually
+ * unusable.
+ *
+ * The straightforward workaround would be to fix it up
+ * just before SYSRET, but that would slow down the system
+ * call fast paths. Instead, we ensure that SS is never NULL
+ * in system call context. We do this by replacing NULL
+ * SS selectors at every context switch. SYSCALL sets up
+ * a valid SS, so the only way to get NULL is to re-enter
+ * the kernel from CPL 3 through an interrupt. Since that
+ * can't happen in the same task as a running syscall, we
+ * are guaranteed to context switch between every interrupt
+ * vector entry and a subsequent SYSRET.
+ */
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel == 0)
+ loadsegment(ss, __KERNEL_DS);
+ }
+
return prev_p;
}

--
2.1.0


2015-04-24 02:19:12

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 23, 2015 at 7:15 PM, Andy Lutomirski <[email protected]> wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.
>
> Please don't actually apply this until we're sure we understand the
> scope of the issue. If this doesn't affect SYSRETQ, then we might
> to fix it on before SYSRETL to avoid impacting 64-bit processes
> at all.

Even if the issue affects SYSRETQ, it could be that we don't care. If
the extent of the info leak is whether we context switched during a
64-bit syscall to a non-syscall context, then this is basically
uninteresting. In that case, we could either ignore the 64-bit issue
entirely or fix it the other way: force SS to NULL on context switch
(much faster, I presume) and fix it up before SYSRETL as Denys
suggested.

We clearly don't have a spate of crashes in programs that do SYSCALL
from a 64-bit CS and then far jump/return to a 32-bit CS without first
reloading SS, since this bug has been here forever. I agree that the
issue is ugly (if it exists in the first place), but maybe we don't
need to fix it.

Thoughts?

--Andy

2015-04-24 03:58:11

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 23, 2015 at 10:15 PM, Andy Lutomirski <[email protected]> wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.
>
> Please don't actually apply this until we're sure we understand the
> scope of the issue. If this doesn't affect SYSRETQ, then we might
> to fix it on before SYSRETL to avoid impacting 64-bit processes
> at all.

It works with Wine. Tested on an AMD Phenom II.

--
Brian Gerst

2015-04-24 10:00:23

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/24/2015 04:15 AM, Andy Lutomirski wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.
>
> Please don't actually apply this until we're sure we understand the
> scope of the issue. If this doesn't affect SYSRETQ, then we might
> to fix it on before SYSRETL to avoid impacting 64-bit processes
> at all.
>
> arch/x86/ia32/ia32entry.S | 5 +++++
> arch/x86/include/asm/cpufeature.h | 1 +
> arch/x86/kernel/cpu/amd.c | 3 +++
> arch/x86/kernel/entry_64.S | 6 ++++++
> arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++
> 5 files changed, 40 insertions(+)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 3c9fadf8b95c..ff519ea8b054 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -421,6 +421,11 @@ sysretl_from_sys_call:
> * cs and ss are loaded from MSRs.
> * (Note: 32bit->32bit SYSRET is different: since r11
> * does not exist, it merely sets eflags.IF=1).
> + *
> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
> + * descriptor is not reinitialized. This means that we must
> + * avoid SYSRET with SS == NULL. We prevent that from happening
> + * by reloading SS in __switch_to.
> */
> USERGS_SYSRET32
>
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 854c04b3c9c2..7e244f626301 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -257,6 +257,7 @@
> #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
> #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
> #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
> +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
>
> #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
>
> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> index fd470ebf924e..e4cf63301ff4 100644
> --- a/arch/x86/kernel/cpu/amd.c
> +++ b/arch/x86/kernel/cpu/amd.c
> @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
> if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
> if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
> set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
> +
> + /* AMD CPUs don't reset SS attributes on SYSRET */
> + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
> }
>
> #ifdef CONFIG_X86_32
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 0034012d04ea..ffeaed0534d8 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -295,6 +295,12 @@ system_call_fastpath:
> * rflags from r11 (but RF and VM bits are forced to 0),
> * cs and ss are loaded from MSRs.
> * Restoration of rflags re-enables interrupts.
> + *
> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
> + * descriptor is not reinitialized. This means that we should
> + * avoid SYSRET with SS == NULL. We prevent that from happening
> + * by reloading SS in __switch_to. (Actually detecting the failure
> + * in 64-bit userspace is tricky but can be done.)
> */
> USERGS_SYSRET64
>
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 4baaa972f52a..919d6c2abded 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -419,6 +419,31 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
> __switch_to_xtra(prev_p, next_p, tss);
>
> + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
> + /*
> + * AMD CPUs have a misfeature: SYSRET sets the SS selector
> + * but does not update the cached descriptor. As a result,
> + * if we do SYSRET while SS is NULL, we'll end up in user
> + * mode with SS apparently equal to __USER_DS but actually
> + * unusable.
> + *
> + * The straightforward workaround would be to fix it up
> + * just before SYSRET, but that would slow down the system
> + * call fast paths. Instead, we ensure that SS is never NULL
> + * in system call context. We do this by replacing NULL
> + * SS selectors at every context switch. SYSCALL sets up
> + * a valid SS, so the only way to get NULL is to re-enter
> + * the kernel from CPL 3 through an interrupt. Since that
> + * can't happen in the same task as a running syscall, we
> + * are guaranteed to context switch between every interrupt
> + * vector entry and a subsequent SYSRET.
> + */
> + unsigned short ss_sel;
> + savesegment(ss, ss_sel);
> + if (ss_sel == 0)
> + loadsegment(ss, __KERNEL_DS);

I propose a more conservative check:

if (ss_sel != __KERNEL_DS)
loadsegment(ss, __KERNEL_DS);

I would propose this even if I saw no real case where it matters...
but in fact I do see such a case.

24593_APM_v21.pdf, page 110:

"""
Figure 4-35 on page 110 shows a long-mode stack switch. In long mode, all call gates must reference
64-bit code-segment descriptors, so a long-mode stack switch uses a 64-bit stack. The process of
switching stacks in long mode is similar to switching in legacy mode when no parameters are passed.
The process is as follows:

1. The target code-segment DPL is read by the processor and used as an index into the 64-bit TSS
for selecting the new stack pointer (RSP).

2. The RSP register is loaded with the new RSP value read from the TSS. The SS register is loaded
with a null selector (SS=0). Setting the new SS selector to null allows proper handling of nested
control transfers in 64-bit mode. See "Nested Returns to 64-Bit Mode Procedures" on page 112
for additional information.

As in legacy mode, it is desirable to keep the stack-segment requestor privilege-level (SS.RPL)
equal to the current privilege-level (CPL). When using a call gate to change privilege levels, the
SS.RPL is updated to reflect the new CPL. The SS.RPL is restored from the return-target CS.RPL
on the subsequent privilege-level-changing far return.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ THIS ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

3. The old values of the SS and RSP registers are pushed onto the stack pointed to by the new RSP.
...
...
"""

Thus, the null selector in SS may not actually be all zeros: its RPL may be nonzero
if we are not running in ring 0.

And in some paravirt setups kernel does run in a nonzero ring:

arch/x86/include/asm/paravirt.h:

#define get_kernel_rpl() (pv_info.kernel_rpl)
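
Concretely, Andy's hunk above would then become something like this
(an untested sketch of the same workaround; only the check changes):

        if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
                unsigned short ss_sel;

                savesegment(ss, ss_sel);
                /*
                 * Compare against __KERNEL_DS rather than 0: a "null" SS
                 * can carry a nonzero RPL (see the APM excerpt above), and
                 * with paravirt the kernel may not run at ring 0 at all.
                 */
                if (ss_sel != __KERNEL_DS)
                        loadsegment(ss, __KERNEL_DS);
        }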

2015-04-24 10:59:20

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 11:59:42AM +0200, Denys Vlasenko wrote:
> I propose a more conservative check:
>
> if (ss_sel != __KERNEL_DS)
> loadsegment(ss, __KERNEL_DS);
>
> I would propose this even if I would see no real case where it matters...
> but I even do see such a case.

...

> As in legacy mode, it is desirable to keep the stack-segment requestor privilege-level (SS.RPL)
> equal to the current privilege-level (CPL). When using a call gate to change privilege levels, the
> SS.RPL is updated to reflect the new CPL. The SS.RPL is restored from the return-target CS.RPL
> on the subsequent privilege-level-changing far return.
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ THIS ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> 3. The old values of the SS and RSP registers are pushed onto the stack pointed to by the new RSP.
> ...
> ...
> """
>
> Thus, the NULL selector in SS may actually be not all zeros. Its RPL may be nonzero,
> if we are not in ring 0.

Yeah, that makes more sense. So I tested Andy's patch but changed it as
above and I get

$ taskset -c 0 ./sysret_ss_attrs_32
[RUN] Syscalls followed by SS validation
[OK] We survived

And this is on an AMD F16h and it used to fail before the patch. So
yeah, I think we can call this misfeature "architectural".
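
(The selftest source isn't in this thread yet; a minimal reproducer is
of roughly this shape - a sketch only, not Andy's actual test - with
the _32 variant simply being this built with -m32 and -lpthread:)

        #include <pthread.h>
        #include <stdio.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /*
         * Burn CPU so the main thread gets preempted between the syscall
         * and the following stack access (both threads pinned to one CPU
         * with taskset, as above).
         */
        static void *spinner(void *arg)
        {
                for (;;)
                        ;
                return NULL;
        }

        int main(void)
        {
                pthread_t t;
                int i;

                pthread_create(&t, NULL, spinner, NULL);

                printf("[RUN]\tSyscalls followed by SS validation\n");
                for (i = 0; i < 1000000; i++) {
                        volatile int on_stack;

                        syscall(SYS_getpid);    /* returns to userspace via SYSRET */
                        on_stack = i;           /* SS-relative access: faults here if
                                                   the cached SS attributes are stale */
                }
                printf("[OK]\tWe survived\n");
                return 0;
        }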

Tested-by: Borislav Petkov <[email protected]>

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-24 11:27:25

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/24/2015 04:15 AM, Andy Lutomirski wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.
>
> Please don't actually apply this until we're sure we understand the
> scope of the issue. If this doesn't affect SYSRETQ, then we might
> to fix it on before SYSRETL to avoid impacting 64-bit processes
> at all.
>
> arch/x86/ia32/ia32entry.S | 5 +++++
> arch/x86/include/asm/cpufeature.h | 1 +
> arch/x86/kernel/cpu/amd.c | 3 +++
> arch/x86/kernel/entry_64.S | 6 ++++++
> arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++
> 5 files changed, 40 insertions(+)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 3c9fadf8b95c..ff519ea8b054 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -421,6 +421,11 @@ sysretl_from_sys_call:
> * cs and ss are loaded from MSRs.
> * (Note: 32bit->32bit SYSRET is different: since r11
> * does not exist, it merely sets eflags.IF=1).
> + *
> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
> + * descriptor is not reinitialized. This means that we must
> + * avoid SYSRET with SS == NULL. We prevent that from happening
> + * by reloading SS in __switch_to.
> */
> USERGS_SYSRET32
>
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 854c04b3c9c2..7e244f626301 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -257,6 +257,7 @@
> #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
> #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
> #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
> +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
>
> #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
>
> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> index fd470ebf924e..e4cf63301ff4 100644
> --- a/arch/x86/kernel/cpu/amd.c
> +++ b/arch/x86/kernel/cpu/amd.c
> @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
> if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
> if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
> set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
> +
> + /* AMD CPUs don't reset SS attributes on SYSRET */
> + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
> }
>
> #ifdef CONFIG_X86_32
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 0034012d04ea..ffeaed0534d8 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -295,6 +295,12 @@ system_call_fastpath:
> * rflags from r11 (but RF and VM bits are forced to 0),
> * cs and ss are loaded from MSRs.
> * Restoration of rflags re-enables interrupts.
> + *
> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
> + * descriptor is not reinitialized. This means that we should
> + * avoid SYSRET with SS == NULL. We prevent that from happening
> + * by reloading SS in __switch_to. (Actually detecting the failure
> + * in 64-bit userspace is tricky but can be done.)
> */
> USERGS_SYSRET64
>
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 4baaa972f52a..919d6c2abded 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -419,6 +419,31 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
> __switch_to_xtra(prev_p, next_p, tss);
>
> + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
> + /*
> + * AMD CPUs have a misfeature: SYSRET sets the SS selector
> + * but does not update the cached descriptor. As a result,
> + * if we do SYSRET while SS is NULL, we'll end up in user
> + * mode with SS apparently equal to __USER_DS but actually
> + * unusable.

Perhaps it makes sense to mention here in the comment _how_ SS can
become NULL: namely, that interrupts/exceptions taken while the CPU
is not at CPL 0 load SS with the null selector.
Before this discussion, I had only the vaguest idea that
"I read something about that somewhere in the docs".

Which also suggests that this if() is unnecessary if
interrupts never cause a task switch, i.e. if they always return
to the same task they interrupted. They return with IRET,
so the bad cached SS descriptor is never leaked to userspace.

IOW: wrap this if() in "#ifdef CONFIG_PREEMPT".

> + *
> + * The straightforward workaround would be to fix it up
> + * just before SYSRET, but that would slow down the system
> + * call fast paths. Instead, we ensure that SS is never NULL
> + * in system call context. We do this by replacing NULL
> + * SS selectors at every context switch. SYSCALL sets up
> + * a valid SS, so the only way to get NULL is to re-enter
> + * the kernel from CPL 3 through an interrupt. Since that
> + * can't happen in the same task as a running syscall, we
> + * are guaranteed to context switch between every interrupt
> + * vector entry and a subsequent SYSRET.
> + */
> + unsigned short ss_sel;
> + savesegment(ss, ss_sel);
> + if (ss_sel == 0)
> + loadsegment(ss, __KERNEL_DS);
> + }
> +
> return prev_p;
> }
>
>

2015-04-24 12:00:28

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 7:27 AM, Denys Vlasenko <[email protected]> wrote:
> On 04/24/2015 04:15 AM, Andy Lutomirski wrote:
>> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
>> with SS == 0 results in an invalid usermode state in which SS is
>> apparently equal to __USER_DS but causes #SS if used.
>>
>> Work around the issue by replacing NULL SS values with __KERNEL_DS
>> in __switch_to, thus ensuring that SYSRET never happens with SS set
>> to NULL.
>>
>> This was exposed by a recent vDSO cleanup.
>>
>> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>>
>> Tested only on Intel, which isn't very interesting. I'll tidy up
>> and send a test case, too, once Borislav confirms that it works.
>>
>> Please don't actually apply this until we're sure we understand the
>> scope of the issue. If this doesn't affect SYSRETQ, then we might
>> to fix it on before SYSRETL to avoid impacting 64-bit processes
>> at all.
>>
>> arch/x86/ia32/ia32entry.S | 5 +++++
>> arch/x86/include/asm/cpufeature.h | 1 +
>> arch/x86/kernel/cpu/amd.c | 3 +++
>> arch/x86/kernel/entry_64.S | 6 ++++++
>> arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++
>> 5 files changed, 40 insertions(+)
>>
>> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
>> index 3c9fadf8b95c..ff519ea8b054 100644
>> --- a/arch/x86/ia32/ia32entry.S
>> +++ b/arch/x86/ia32/ia32entry.S
>> @@ -421,6 +421,11 @@ sysretl_from_sys_call:
>> * cs and ss are loaded from MSRs.
>> * (Note: 32bit->32bit SYSRET is different: since r11
>> * does not exist, it merely sets eflags.IF=1).
>> + *
>> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
>> + * descriptor is not reinitialized. This means that we must
>> + * avoid SYSRET with SS == NULL. We prevent that from happening
>> + * by reloading SS in __switch_to.
>> */
>> USERGS_SYSRET32
>>
>> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
>> index 854c04b3c9c2..7e244f626301 100644
>> --- a/arch/x86/include/asm/cpufeature.h
>> +++ b/arch/x86/include/asm/cpufeature.h
>> @@ -257,6 +257,7 @@
>> #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
>> #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
>> #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
>> +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
>>
>> #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
>>
>> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
>> index fd470ebf924e..e4cf63301ff4 100644
>> --- a/arch/x86/kernel/cpu/amd.c
>> +++ b/arch/x86/kernel/cpu/amd.c
>> @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
>> if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
>> if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
>> set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
>> +
>> + /* AMD CPUs don't reset SS attributes on SYSRET */
>> + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
>> }
>>
>> #ifdef CONFIG_X86_32
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 0034012d04ea..ffeaed0534d8 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -295,6 +295,12 @@ system_call_fastpath:
>> * rflags from r11 (but RF and VM bits are forced to 0),
>> * cs and ss are loaded from MSRs.
>> * Restoration of rflags re-enables interrupts.
>> + *
>> + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
>> + * descriptor is not reinitialized. This means that we should
>> + * avoid SYSRET with SS == NULL. We prevent that from happening
>> + * by reloading SS in __switch_to. (Actually detecting the failure
>> + * in 64-bit userspace is tricky but can be done.)
>> */
>> USERGS_SYSRET64
>>
>> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
>> index 4baaa972f52a..919d6c2abded 100644
>> --- a/arch/x86/kernel/process_64.c
>> +++ b/arch/x86/kernel/process_64.c
>> @@ -419,6 +419,31 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
>> task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
>> __switch_to_xtra(prev_p, next_p, tss);
>>
>> + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
>> + /*
>> + * AMD CPUs have a misfeature: SYSRET sets the SS selector
>> + * but does not update the cached descriptor. As a result,
>> + * if we do SYSRET while SS is NULL, we'll end up in user
>> + * mode with SS apparently equal to __USER_DS but actually
>> + * unusable.
>
> Perhaps it makes sense to mention here in comment
> _how_ SS may become NULL: namely, that interrupts/exceptions
> from !CPL0 load SS with null selector.
> Before this discussion, I had only vaguest idea that
> "I read something about that somewhere in the docs".
>
> Which also suggests that this if() is unnecessary if
> interrupts never cause task to switch, when they always return
> to the same task they interrupted. They return with IRET,
> thus bad SS cached descriptor is not leaked to userspace.
>
> IOW: frame this if() in "#ifdef CONFIG_PREEMPT".

Intel's docs say that SS will be set to NULL if the CPL changes, or an
IST stack is used. It implies that SS is not changed if CPL doesn't
change and IST=0.

AMD also changes to NULL SS on a CPL change, but does not mention it
for IST stack switches.

So actually this isn't a preemption issue, as the NULL SS is coming
from an interrupt from userspace (timer tick, etc.).

Another alternative to consider is setting SS=__KERNEL_DS on interrupt
entry if it's NULL.

--
Brian Gerst

2015-04-24 16:25:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 5:00 AM, Brian Gerst <[email protected]> wrote:
>
> So actually this isn't a preemption issue, as the NULL SS is coming
> from an interrupt from userspace (timer tick, etc.).

It *is* a preemption issue, in the sense that the interrupt that
clears SS also then returns to user space using an "iret" that will
properly restore it.

So the only case we need to worry about is the preemption case, where
the interrupt has caused a task switch (typically because it woke
something up or it was the timer interrupt and the timeslice of the
previous task is up), and we switch to another context that returns to
user space using "sysret" instead.

> Another alternative to consider is setting SS=__KERNEL_DS on interrupt
> entry if it's NULL.

The interrupt path is likely more critical than the scheduler path.
Also, it's any exception, afaik, so it's a lot less targeted.

I like Andy's patch. It looks good and efficient. We need to keep this
issue in mind if we ever expand the "Use sysret to return to userspace
when possible" approach to other things than just system call returns,
but with the limitations of the contents of RCX/R11, that's unlikely
to ever be a useful thing anyway.

Linus

2015-04-24 17:33:09

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 12:25 PM, Linus Torvalds
<[email protected]> wrote:
> On Fri, Apr 24, 2015 at 5:00 AM, Brian Gerst <[email protected]> wrote:
>>
>> So actually this isn't a preemption issue, as the NULL SS is coming
>> from an interrupt from userspace (timer tick, etc.).
>
> It *is* a preemption issue, in the sense that the interrupt that
> clears SS also then returns to user space using an "iret" that will
> properly restore it.
>
> So the only case we need to worry about is the preemption case, where
> the interrupt has caused a task switch (typically because it woke
> something up or it was the timer interrupt and the timeslice of the
> previous task is up), and we switch to another context that returns to
> user space using "sysret" instead.

To clarify, I was thinking of the CONFIG_PREEMPT case. A nested
interrupt wouldn't change SS, and IST interrupts can't schedule.

--
Brian Gerst

2015-04-24 17:41:12

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 10:33 AM, Brian Gerst <[email protected]> wrote:
>
> To clarify, I was thinking of the CONFIG_PREEMPT case. A nested
> interrupt wouldn't change SS, and IST interrupts can't schedule.

It has absolutely nothing to do with nested interrupts or CONFIG_PREEMPT.

The problem happens simply because

- process A does a system call SS=__KERNEL_DS

- the system call sleeps for whatever reason. SS is still __KERNEL_DS

- process B runs, returns to user space, and takes an interrupt. Now SS=0

- process B is about to return to user space (where the interrupt
happened), but we schedule as part of that regular user-space return.
SS=0

- process A returns to user space using sysret, the SS selector
becomes __USER_DS, but the cached descriptor remains non-present

Notice? No nested interrupts, no CONFIG_PREEMPT, nothing special at all.

The reason Luto's patch fixes the problem is that now the scheduling
from B back to A will reload SS, making it __KERNEL_DS, but more
importantly, fixing the cached descriptor to be the usual present flag
one, which is what the AMD sysret instruction needs.

Or do I misunderstand what you are talking about?

Linus

2015-04-24 17:58:02

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 1:41 PM, Linus Torvalds
<[email protected]> wrote:
> On Fri, Apr 24, 2015 at 10:33 AM, Brian Gerst <[email protected]> wrote:
>>
>> To clarify, I was thinking of the CONFIG_PREEMPT case. A nested
>> interrupt wouldn't change SS, and IST interrupts can't schedule.
>
> It has absolutely nothing to do with nested interrupts or CONFIG_PREEMPT.
>
> The problem happens simply because
>
> - process A does a system call SS=__KERNEL_DS
>
> - the system call sleeps for whatever reason. SS is still __KERNEL_DS
>
> - process B runs, returns to user space, and takes an interrupt. Now SS=0
>
> - process B is about to return to user space (where the interrupt
> happened), but we schedule as part of that regular user-space return.
> SS=0
>
> - process A returns to user space using sysret, the SS selector
> becomes __USER_DS, but the cached descriptor remains non-present
>
> Notice? No nested interrupts, no CONFIG_PREEMPT, nothing special at all.
>
> The reason Luto's patch fixes the problem is that now the scheduling
> from B back to A will reload SS, making it __KERNEL_DS, but more
> importantly, fixing the cached descriptor to be the usual present flag
> one, which is what the AMD sysret instruction needs.
>
> Or do I misunderstand what you are talking about?
>
> Linus

Your explanation is correct. I meant that this can happen even if
CONFIG_PREEMPT is disabled. I just took "preemption" to mean kernel
preemption, not normal scheduling.

--
Brian Gerst

2015-04-24 19:58:25

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 12:59:06PM +0200, Borislav Petkov wrote:
> Yeah, that makes more sense. So I tested Andy's patch but changed it as
> above and I get
>
> $ taskset -c 0 ./sysret_ss_attrs_32
> [RUN] Syscalls followed by SS validation
> [OK] We survived

Andy, you wanted the 64-bit version too:

$ taskset -c 0 ./sysret_ss_attrs_64
[RUN] Syscalls followed by SS validation
[OK] We survived
$ taskset -c 0 ./sysret_ss_attrs_64
[RUN] Syscalls followed by SS validation
[OK] We survived

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-24 20:21:29

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 23, 2015 at 7:15 PM, Andy Lutomirski <[email protected]> wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.
>
> Please don't actually apply this until we're sure we understand the
> scope of the issue. If this doesn't affect SYSRETQ, then we might
> to fix it on before SYSRETL to avoid impacting 64-bit processes
> at all.
>

After sleeping on it, I think I want to offer a different, more
complicated approach. AFAIK there are really only two ways that this
issue can be visible:

1. SYSRETL. We can fix that up in the AMD SYSRETL path. I think
there's a decent argument that that path is less performance-critical
than context switches.

2. SYSRETQ. The only way that I know of to see the problem is SYSRETQ
followed by a far jump or return. This is presumably *extremely*
rare.

What if we fixed #2 up in do_stack_segment? We should double-check
the docs, but I think that this will only ever manifest as #SS(0) with
regs->ss == __USER_DS and !user_mode_64bit(regs). We need to avoid
infinite retry loops, but this might be okay. I think that #SS(0)
from userspace under those conditions can *only* happen as a result of
this issue. Even if not, we could come up with a way to only retry
once per syscall (e.g. set some ti->status flag in the 64-bit syscall
path on AMD and clear it in do_stack_segment).
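
Roughly something like this - very much a sketch; TIF_SYSRET_SS_FIXUP
is a made-up name for that per-syscall flag, and the trigger condition
still needs to be checked against the manuals:

        /*
         * Sketch only: the IRET back to userspace reloads SS from the
         * exception frame, refreshing the cached descriptor, so simply
         * returning retries the faulting instruction with a usable SS.
         */
        dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
        {
                if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS) &&
                    user_mode(regs) && error_code == 0 &&
                    regs->ss == __USER_DS &&
                    test_and_clear_thread_flag(TIF_SYSRET_SS_FIXUP))
                        return;         /* retry; IRET reinitializes SS */

                do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS);
        }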

This might be way more trouble than it's worth. For one thing, we
need to be careful with the IRET fixup. Ick. So maybe this should be
written off as my useless ramblings.

NB: I suspect that all of this is irrelevant on Xen. Xen does its own
thing wrt sysret.

--Andy

2015-04-24 20:47:21

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 10:21 PM, Andy Lutomirski <[email protected]> wrote:
> On Thu, Apr 23, 2015 at 7:15 PM, Andy Lutomirski <[email protected]> wrote:
>> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
>> with SS == 0 results in an invalid usermode state in which SS is
>> apparently equal to __USER_DS but causes #SS if used.
>>
>> Work around the issue by replacing NULL SS values with __KERNEL_DS
>> in __switch_to, thus ensuring that SYSRET never happens with SS set
>> to NULL.
>>
>> This was exposed by a recent vDSO cleanup.
>>
>> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>>
>> Tested only on Intel, which isn't very interesting. I'll tidy up
>> and send a test case, too, once Borislav confirms that it works.
>>
>> Please don't actually apply this until we're sure we understand the
>> scope of the issue. If this doesn't affect SYSRETQ, then we might
>> to fix it on before SYSRETL to avoid impacting 64-bit processes
>> at all.
>>
>
> After sleeping on it, I think I want to offer a different, more
> complicated approach. AFAIK there are really only two ways that this
> issue can be visible:
>
> 1. SYSRETL. We can fix that up in the AMD SYSRETL path. I think
> there's a decent argument that that path is less performance-critical
> than context switches.
>
> 2. SYSRETQ. The only way that I know of to see the problem is SYSRETQ
> followed by a far jump or return. This is presumably *extremely*
> rare.
>
> What if we fixed #2 up in do_stack_segment. We should double-check
> the docs, but I think that this will only ever manifest as #SS(0) with
> regs->ss == __USER_DS and !user_mode_64bit(regs). We need to avoid
> infinite retry looks, but this might be okay. I think that #SS(0)
> from userspace under those conditions can *only* happen as a result of
> this issue. Even if not, we could come up with a way to only retry
> once per syscall (e.g. set some ti->status flag in the 64-bit syscall
> path on AMD and clear it in do_stack_segment).
>
> This might be way more trouble than it's worth.

Exactly my feeling. What are you trying to save? About four CPU
cycles of checking %ss != __KERNEL_DS on each switch_to?
That's not worth bothering about. Your last patch seems to be perfect.

2015-04-24 20:50:27

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 1:46 PM, Denys Vlasenko
<[email protected]> wrote:
> On Fri, Apr 24, 2015 at 10:21 PM, Andy Lutomirski <[email protected]> wrote:
>> On Thu, Apr 23, 2015 at 7:15 PM, Andy Lutomirski <[email protected]> wrote:
>>> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
>>> with SS == 0 results in an invalid usermode state in which SS is
>>> apparently equal to __USER_DS but causes #SS if used.
>>>
>>> Work around the issue by replacing NULL SS values with __KERNEL_DS
>>> in __switch_to, thus ensuring that SYSRET never happens with SS set
>>> to NULL.
>>>
>>> This was exposed by a recent vDSO cleanup.
>>>
>>> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>> ---
>>>
>>> Tested only on Intel, which isn't very interesting. I'll tidy up
>>> and send a test case, too, once Borislav confirms that it works.
>>>
>>> Please don't actually apply this until we're sure we understand the
>>> scope of the issue. If this doesn't affect SYSRETQ, then we might
>>> to fix it on before SYSRETL to avoid impacting 64-bit processes
>>> at all.
>>>
>>
>> After sleeping on it, I think I want to offer a different, more
>> complicated approach. AFAIK there are really only two ways that this
>> issue can be visible:
>>
>> 1. SYSRETL. We can fix that up in the AMD SYSRETL path. I think
>> there's a decent argument that that path is less performance-critical
>> than context switches.
>>
>> 2. SYSRETQ. The only way that I know of to see the problem is SYSRETQ
>> followed by a far jump or return. This is presumably *extremely*
>> rare.
>>
>> What if we fixed #2 up in do_stack_segment. We should double-check
>> the docs, but I think that this will only ever manifest as #SS(0) with
>> regs->ss == __USER_DS and !user_mode_64bit(regs). We need to avoid
>> infinite retry looks, but this might be okay. I think that #SS(0)
>> from userspace under those conditions can *only* happen as a result of
>> this issue. Even if not, we could come up with a way to only retry
>> once per syscall (e.g. set some ti->status flag in the 64-bit syscall
>> path on AMD and clear it in do_stack_segment).
>>
>> This might be way more trouble than it's worth.
>
> Exactly my feeling. What are you trying to save? About four CPU
> cycles of checking %ss != __KERNEL_DS on each switch_to?
> That's not worth bothering about. Your last patch seems to be perfect.

We'll have to do the write to ss almost every time an AMD CPU sleeps
in a syscall. Maybe that's still not a big deal.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2015-04-24 20:53:41

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 1:21 PM, Andy Lutomirski <[email protected]> wrote:
>
> 2. SYSRETQ. The only way that I know of to see the problem is SYSRETQ
> followed by a far jump or return. This is presumably *extremely*
> rare.
>
> What if we fixed #2 up in do_stack_segment. We should double-check
> the docs, but I think that this will only ever manifest as #SS(0) with
> regs->ss == __USER_DS and !user_mode_64bit(regs).

Hmm. It smells a bit "too clever" for me, and in particular, I think
you need a good test-case for this. But yeah, I guess that gets things
out of any possibly critical paths.

That said, I wouldn't even be sure about the SS(0). The rules about
when you get SS and when you get GP are odd.

Also, you need to check what happens when somebody does something like

movl $-1,%eax
ss ; movl (%eax),%eax

because I think that gets a #DB(0) too with the situation you expect
to be "unique", because the address wraps.. I dunno.

So care and testing needed. I think the scheduler approach is a *lot*
more obvious, quite frankly.

Linus

2015-04-24 21:45:54

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/24/2015 01:50 PM, Andy Lutomirski wrote:
>>
>> Exactly my feeling. What are you trying to save? About four CPU
>> cycles of checking %ss != __KERNEL_DS on each switch_to?
>> That's not worth bothering about. Your last patch seems to be perfect.
>
> We'll have to do the write to ss almost every time an AMD CPU sleeps
> in a syscall. Maybe that's still not a big deal.
>

Once you're sleeping anyway, I don't think so.

-hpa


2015-04-25 02:17:27

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 10:50 PM, Andy Lutomirski <[email protected]> wrote:
> On Fri, Apr 24, 2015 at 1:46 PM, Denys Vlasenko
>>> This might be way more trouble than it's worth.
>>
>> Exactly my feeling. What are you trying to save? About four CPU
>> cycles of checking %ss != __KERNEL_DS on each switch_to?
>> That's not worth bothering about. Your last patch seems to be perfect.
>
> We'll have to do the write to ss almost every time an AMD CPU sleeps
> in a syscall.

Why do you think so?
Scheduling from a syscall which decided to block won't require
writing to %ss, since in this case %ss isn't NULL.

Writing to %ss will happen every time we schedule from an interrupt.
With a timer interrupt every 1 ms, that means scheduling at most ~1000
times per second, even if _every_ such interrupt causes a task switch.
This is still not often enough to worry about.

2015-04-25 21:12:24

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 23, 2015 at 07:15:01PM -0700, Andy Lutomirski wrote:
> AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET
> with SS == 0 results in an invalid usermode state in which SS is
> apparently equal to __USER_DS but causes #SS if used.
>
> Work around the issue by replacing NULL SS values with __KERNEL_DS
> in __switch_to, thus ensuring that SYSRET never happens with SS set
> to NULL.
>
> This was exposed by a recent vDSO cleanup.
>
> Fixes: e7d6eefaaa44 x86/vdso32/syscall.S: Do not load __USER32_DS to %ss
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> Tested only on Intel, which isn't very interesting. I'll tidy up
> and send a test case, too, once Borislav confirms that it works.

So I did some benchmarking today. Custom kernel build measured with perf
stat, 10 builds with --pre doing

$ cat pre-build-kernel.sh
make -s clean
echo 3 > /proc/sys/vm/drop_caches

$ cat measure.sh
EVENTS="cpu-clock,task-clock,cycles,instructions,branches,branch-misses,context-switches,migrations"
perf stat -e $EVENTS --sync -a --repeat 10 --pre ~/kernel/pre-build-kernel.sh make -s -j64

I've prepended the perf stat output with markers A:, B: or C: for easier
comparing. The markers mean:

A: Linus' master from a couple of days ago + tip/master + tip/x86/asm
B: With Andy's SYSRET patch ontop
C: Without RCX canonicalness check (see patch at the end).

Numbers are from an AMD F16h box:

A: 2835570.145246 cpu-clock (msec) ( +- 0.02% ) [100.00%]
B: 2833364.074970 cpu-clock (msec) ( +- 0.04% ) [100.00%]
C: 2834708.335431 cpu-clock (msec) ( +- 0.02% ) [100.00%]

This is interesting - The SYSRET SS fix makes it minimally better and
the C-patch is a bit worse again. Net win is 861 msec, almost a second,
oh well.

A: 2835570.099981 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
B: 2833364.073633 task-clock (msec) # 3.996 CPUs utilized ( +- 0.04% ) [100.00%]
C: 2834708.350387 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]

Similar thing observable here.

A: 5,591,213,166,613 cycles # 1.972 GHz ( +- 0.03% ) [75.00%]
B: 5,585,023,802,888 cycles # 1.971 GHz ( +- 0.03% ) [75.00%]
C: 5,587,983,212,758 cycles # 1.971 GHz ( +- 0.02% ) [75.00%]

net win is 3,229,953,855 cycles drop.

A: 3,106,707,101,530 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]
B: 3,106,632,251,528 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
C: 3,106,265,958,142 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]

This looks like it would make sense - instruction count drops from A -> B -> C.

A: 683,676,044,429 branches # 241.107 M/sec ( +- 0.01% ) [75.00%]
B: 683,670,899,595 branches # 241.293 M/sec ( +- 0.01% ) [75.00%]
C: 683,675,772,858 branches # 241.180 M/sec ( +- 0.01% ) [75.00%]

Also makes sense - the C patch adds an unconditional JMP over the
RCX-canonicalness check.

A: 43,829,535,008 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
B: 43,844,118,416 branch-misses # 6.41% of all branches ( +- 0.03% ) [75.00%]
C: 43,819,871,086 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]

And this is nice, branch misses are the smallest with C, cool. It makes
sense again - the C patch adds an unconditional JMP which doesn't miss.

A: 2,030,357 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
B: 2,029,313 context-switches # 0.716 K/sec ( +- 0.05% ) [100.00%]
C: 2,028,566 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]

Those look good.

A: 52,421 migrations # 0.018 K/sec ( +- 1.13% )
B: 52,049 migrations # 0.018 K/sec ( +- 1.02% )
C: 51,365 migrations # 0.018 K/sec ( +- 0.92% )

Same here.

A: 709.528485252 seconds time elapsed ( +- 0.02% )
B: 708.976557288 seconds time elapsed ( +- 0.04% )
C: 709.312844791 seconds time elapsed ( +- 0.02% )

Interestingly, the unconditional JMP kinda costs... Btw, I'm not sure if
kernel build is the optimal workload for benchmarking here but I don't
see why not - it does a lot of syscalls so it should exercise the SYSRET
path sufficiently.

Anyway, we can do this below. Or not, I'm sitting on the fence about
that one.

---
From: Borislav Petkov <[email protected]>
Date: Sat, 25 Apr 2015 19:30:33 +0200
Subject: [PATCH] x86/entry: Avoid canonical RCX check on AMD

It is not needed on AMD as RCX canonicalness is not checked during
SYSRET there.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 1 +
arch/x86/kernel/cpu/intel.c | 2 ++
arch/x86/kernel/entry_64.S | 13 +++++++++----
3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7ee9b94d9921..8d555b046fe9 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -265,6 +265,7 @@
#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_CANONICAL_RCX X86_BUG(8) /* SYSRET #GPs when %RCX non-canonical */

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 50163fa9034f..109a51815e92 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -159,6 +159,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
+
+ set_cpu_bug(c, X86_BUG_CANONICAL_RCX);
}

#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e952f6bf1d6d..d01fb6c1362f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -415,16 +415,20 @@ syscall_return:
jne opportunistic_sysret_failed

/*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*/
.ifne __VIRTUAL_MASK_SHIFT - 47
.error "virtual address width changed -- SYSRET checks need update"
.endif
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ */
+ ALTERNATIVE "jmp 1f", "", X86_BUG_CANONICAL_RCX
+
/* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
@@ -432,6 +436,7 @@ syscall_return:
cmpq %rcx, %r11
jne opportunistic_sysret_failed

+1:
cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
jne opportunistic_sysret_failed

--
2.3.5

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-26 11:22:21

by Borislav Petkov

[permalink] [raw]
Subject: perf numbers (was: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue)

On Sat, Apr 25, 2015 at 11:12:06PM +0200, Borislav Petkov wrote:
> I've prepended the perf stat output with markers A:, B: or C: for easier
> comparing. The markers mean:
>
> A: Linus' master from a couple of days ago + tip/master + tip/x86/asm
> B: With Andy's SYSRET patch ontop
> C: Without RCX canonicalness check (see patch at the end).

What was missing is D = B+C results, here they are:

A: 2835570.145246 cpu-clock (msec) ( +- 0.02% ) [100.00%]
B: 2833364.074970 cpu-clock (msec) ( +- 0.04% ) [100.00%]
C: 2834708.335431 cpu-clock (msec) ( +- 0.02% ) [100.00%]
D: 2835055.118431 cpu-clock (msec) ( +- 0.01% ) [100.00%]

A: 2835570.099981 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
B: 2833364.073633 task-clock (msec) # 3.996 CPUs utilized ( +- 0.04% ) [100.00%]
C: 2834708.350387 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
D: 2835055.094383 task-clock (msec) # 3.996 CPUs utilized ( +- 0.01% ) [100.00%]

A: 5,591,213,166,613 cycles # 1.972 GHz ( +- 0.03% ) [75.00%]
B: 5,585,023,802,888 cycles # 1.971 GHz ( +- 0.03% ) [75.00%]
C: 5,587,983,212,758 cycles # 1.971 GHz ( +- 0.02% ) [75.00%]
D: 5,584,838,532,936 cycles # 1.970 GHz ( +- 0.03% ) [75.00%]

A: 3,106,707,101,530 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]
B: 3,106,632,251,528 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
C: 3,106,265,958,142 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
D: 3,106,294,801,185 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]

A: 683,676,044,429 branches # 241.107 M/sec ( +- 0.01% ) [75.00%]
B: 683,670,899,595 branches # 241.293 M/sec ( +- 0.01% ) [75.00%]
C: 683,675,772,858 branches # 241.180 M/sec ( +- 0.01% ) [75.00%]
D: 683,683,533,664 branches # 241.154 M/sec ( +- 0.00% ) [75.00%]

A: 43,829,535,008 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
B: 43,844,118,416 branch-misses # 6.41% of all branches ( +- 0.03% ) [75.00%]
C: 43,819,871,086 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
D: 43,795,107,998 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]

A: 2,030,357 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
B: 2,029,313 context-switches # 0.716 K/sec ( +- 0.05% ) [100.00%]
C: 2,028,566 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
D: 2,028,895 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]

A: 52,421 migrations # 0.018 K/sec ( +- 1.13% )
B: 52,049 migrations # 0.018 K/sec ( +- 1.02% )
C: 51,365 migrations # 0.018 K/sec ( +- 0.92% )
D: 51,766 migrations # 0.018 K/sec ( +- 1.11% )

A: 709.528485252 seconds time elapsed ( +- 0.02% )
B: 708.976557288 seconds time elapsed ( +- 0.04% )
C: 709.312844791 seconds time elapsed ( +- 0.02% )
D: 709.400050112 seconds time elapsed ( +- 0.01% )

So in all events except "branches" - which is comprehensible, btw -
we have a minimal net win when looking at how the numbers in A have
improved in D *with* *both* patches applied.

And those numbers are pretty nice IMO - even if the net win is a
measurement artefact and not really a win, we certainly don't have a net
loss. So unless anyone objects, I'm going to apply both patches, but wait
for Andy's v2 with better comments and the changed ss_sel test as per
Denys' suggestion.

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-26 12:34:39

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 4:18 AM, Andy Lutomirski <[email protected]> wrote:
> On Thu, Apr 23, 2015 at 7:15 PM, Andy Lutomirski <[email protected]> wrote:
> Even if the issue affects SYSRETQ, it could be that we don't care. If
> the extent of the info leak is whether we context switched during a
> 64-bit syscall to a non-syscall context, then this is basically
> uninteresting. In that case, we could either ignore the 64-bit issue
> entirely or fix it the other way: force SS to NULL on context switch
> (much faster, I presume) and fix it up before SYSRETL as Denys
> suggested.
>
> We clearly don't have a spate of crashes in programs that do SYSCALL
> from a 64-bit CS and then far jump/return to a 32-bit CS without first
> reloading SS, since this bug has been here forever. I agree that the
> issue is ugly (if it exists in the first place), but maybe we don't
> need to fix it.

If you are that concerned about the speed impact,
you can also disable this fix for !CONFIG_IA32_EMULATION kernels.
If the kernel builder declared they don't want 32-bit userspace,
they won't do any ridiculous "I will temporarily
switch to 32-bit mode in 64-bit tasks" stuff either.
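
For reference, a minimal sketch of the kind of conditional SS reload being
discussed for __switch_to() - purely illustrative, not necessarily what
Andy's v2 will look like. It assumes the X86_BUG_SYSRET_SS_ATTRS flag from
the patch and the kernel's savesegment()/loadsegment() helpers, and only
writes %ss when the current selector is not already __KERNEL_DS (the ss_sel
test mentioned above):

        if (unlikely(static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS))) {
                unsigned short ss_sel;

                /*
                 * Only reload SS when it is not already __KERNEL_DS, so the
                 * common context-switch path stays cheap.
                 */
                savesegment(ss, ss_sel);
                if (ss_sel != __KERNEL_DS)
                        loadsegment(ss, __KERNEL_DS);
        }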

2015-04-26 23:37:01

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Fri, Apr 24, 2015 at 7:17 PM, Denys Vlasenko
<[email protected]> wrote:
> On Fri, Apr 24, 2015 at 10:50 PM, Andy Lutomirski <[email protected]> wrote:
>> On Fri, Apr 24, 2015 at 1:46 PM, Denys Vlasenko
>>>> This might be way more trouble than it's worth.
>>>
>>> Exactly my feeling. What are you trying to save? About four CPU
>>> cycles of checking %ss != __KERNEL_DS on each switch_to?
>>> That's not worth bothering about. Your last patch seems to be perfect.
>>
>> We'll have to do the write to ss almost every time an AMD CPU sleeps
>> in a syscall.
>
> Why do you think so?
> Scheduling from a syscall which decided to block won't require
> writing to %ss, since in this case %ss isn't NULL.
>
> Writing to %ss will happen every time we schedule from an interrupt.
> With timer interrupt every 1 ms it means scheduling at most ~1000
> times per second, if _every_ such interrupt causes task switch.
> This is still not often enough to worry.

OK, you've convinced me. I still think it can happen much more than
~1k times per second due to hrtimers (see my test case) or things like
page faults, but those are all slow paths.

v2 coming.

--
Andy Lutomirski
AMA Capital Management, LLC

2015-04-26 23:40:01

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

>
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 7ee9b94d9921..8d555b046fe9 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -265,6 +265,7 @@
> #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
> #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
> #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
> +#define X86_BUG_CANONICAL_RCX X86_BUG(8) /* SYSRET #GPs when %RCX non-canonical */

I think that "sysret" should appear in the name.


>
> #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
>
> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
> index 50163fa9034f..109a51815e92 100644
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -159,6 +159,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
> pr_info("Disabling PGE capability bit\n");
> setup_clear_cpu_cap(X86_FEATURE_PGE);
> }
> +
> + set_cpu_bug(c, X86_BUG_CANONICAL_RCX);

Oh no! My laptop is currently bug-free, and you're breaking it! :)

> }
>
> #ifdef CONFIG_X86_32
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index e952f6bf1d6d..d01fb6c1362f 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -415,16 +415,20 @@ syscall_return:
> jne opportunistic_sysret_failed
>
> /*
> - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> - * in kernel space. This essentially lets the user take over
> - * the kernel, since userspace controls RSP.
> - *
> * If width of "canonical tail" ever becomes variable, this will need
> * to be updated to remain correct on both old and new CPUs.
> */
> .ifne __VIRTUAL_MASK_SHIFT - 47
> .error "virtual address width changed -- SYSRET checks need update"
> .endif
> +
> + /*
> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> + * in kernel space. This essentially lets the user take over
> + * the kernel, since userspace controls RSP.
> + */
> + ALTERNATIVE "jmp 1f", "", X86_BUG_CANONICAL_RCX
> +

I know it would be ugly, but would it be worth saving two bytes by
using ALTERNATIVE "jmp 1f", "shl ...", ...?

> /* Change top 16 bits to be the sign-extension of 47th bit */
> shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> @@ -432,6 +436,7 @@ syscall_return:
> cmpq %rcx, %r11
> jne opportunistic_sysret_failed
>
> +1:
> cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
> jne opportunistic_sysret_failed

--Andy

2015-04-27 08:53:26

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Sun, Apr 26, 2015 at 04:39:38PM -0700, Andy Lutomirski wrote:
> > +#define X86_BUG_CANONICAL_RCX X86_BUG(8) /* SYSRET #GPs when %RCX non-canonical */
>
> I think that "sysret" should appear in the name.

Yeah, I thought about it too, will fix.

> Oh no! My laptop is currently bug-free, and you're breaking it! :)

Muahahahhahaha...

> > +
> > + /*
> > + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> > + * in kernel space. This essentially lets the user take over
> > + * the kernel, since userspace controls RSP.
> > + */
> > + ALTERNATIVE "jmp 1f", "", X86_BUG_CANONICAL_RCX
> > +
>
> I know it would be ugly, but would it be worth saving two bytes by
> using ALTERNATIVE "jmp 1f", "shl ...", ...?
>
> > /* Change top 16 bits to be the sign-extension of 47th bit */
> > shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> > sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> > @@ -432,6 +436,7 @@ syscall_return:
> > cmpq %rcx, %r11
> > jne opportunistic_sysret_failed

You want to stick all 4 insns in the alternative? Yeah, it should work,
but it might be even more unreadable than it is now.

Btw, we can do this too:

ALTERNATIVE "",
"shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
cmpq %rcx, %r11 \
jne opportunistic_sysret_failed"
X86_BUG_SYSRET_CANONICAL_RCX

which will replace the 2-byte JMP with a lot of NOPs on AMD.

I'll trace it again to see which one is worse :-)

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 10:07:49

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/27/2015 10:53 AM, Borislav Petkov wrote:
> On Sun, Apr 26, 2015 at 04:39:38PM -0700, Andy Lutomirski wrote:
>>> +#define X86_BUG_CANONICAL_RCX X86_BUG(8) /* SYSRET #GPs when %RCX non-canonical */
>>
>> I think that "sysret" should appear in the name.
>
> Yeah, I thought about it too, will fix.
>
>> Oh no! My laptop is currently bug-free, and you're breaking it! :)
>
> Muahahahhahaha...
>
>>> +
>>> + /*
>>> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
>>> + * in kernel space. This essentially lets the user take over
>>> + * the kernel, since userspace controls RSP.
>>> + */
>>> + ALTERNATIVE "jmp 1f", "", X86_BUG_CANONICAL_RCX
>>> +
>>
>> I know it would be ugly, but would it be worth saving two bytes by
>> using ALTERNATIVE "jmp 1f", "shl ...", ...?
>>
>>> /* Change top 16 bits to be the sign-extension of 47th bit */
>>> shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
>>> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
>>> @@ -432,6 +436,7 @@ syscall_return:
>>> cmpq %rcx, %r11
>>> jne opportunistic_sysret_failed
>
> You want to stick all 4 insns in the alternative? Yeah, it should work
> but it might even more unreadable than it is now.
>
> Btw, we can do this too:
>
> ALTERNATIVE "",
> "shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> cmpq %rcx, %r11 \
> jne opportunistic_sysret_failed"
> X86_BUG_SYSRET_CANONICAL_RCX
>
> which will replace the 2-byte JMP with a lot of NOPs on AMD.

The instructions you want to NOP out are translated to these bytes:

2c2: 48 c1 e1 10 shl $0x10,%rcx
2c6: 48 c1 f9 10 sar $0x10,%rcx
2ca: 49 39 cb cmp %rcx,%r11
2cd: 75 5f jne 32e <opportunistic_sysret_failed>

According to http://instlatx64.atw.hu/
CPUs from both AMD and Intel are happy to eat "66,66,66,90" NOPs
with maximum throughput; more than three 66 prefixes slow decode down,
sometimes horrifically (from 3 insns per cycle to one insn per ~10 cycles).

Probably doing something like this

/* Only three 0x66 prefixes for NOP for fast decode on all CPUs */
ALTERNATIVE ".byte 0x66,0x66,0x66,0x90 \
.byte 0x66,0x66,0x66,0x90 \
.byte 0x66,0x66,0x66,0x90",
"shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
cmpq %rcx, %r11 \
jne opportunistic_sysret_failed"
X86_BUG_SYSRET_CANONICAL_RCX

would be better than letting ALTERNATIVE generate 13 one-byte NOPs.

--
vda

2015-04-27 10:09:53

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 12:07:14PM +0200, Denys Vlasenko wrote:
> /* Only three 0x66 prefixes for NOP for fast decode on all CPUs */
> ALTERNATIVE ".byte 0x66,0x66,0x66,0x90 \
> .byte 0x66,0x66,0x66,0x90 \
> .byte 0x66,0x66,0x66,0x90",
> "shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> cmpq %rcx, %r11 \
> jne opportunistic_sysret_failed"
> X86_BUG_SYSRET_CANONICAL_RCX
>
> would be better than letting ALTERNATIVE to generate 13 one-byte NOPs.

We already do everything automagically, see:
arch/x86/kernel/alternative.c: optimize_nops()

:-)

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 11:35:25

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 10:53:05AM +0200, Borislav Petkov wrote:
> ALTERNATIVE "",
> "shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
> cmpq %rcx, %r11 \
> jne opportunistic_sysret_failed"
> X86_BUG_SYSRET_CANONICAL_RCX

Right, so I can do this:

/*
* Change top 16 bits to be the sign-extension of 47th bit, if this
* changed %rcx, it was not canonical.
*/
ALTERNATIVE "", \
"shl $(64 - (47+1)), %rcx; \
sar $(64 - (47+1)), %rcx; \
cmpq %rcx, %r11; \
jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX

If I use the __VIRTUAL_MASK_SHIFT macro *in* the ALTERNATIVE macro, I get some
really cryptic gas error:

arch/x86/kernel/entry_64.S: Assembler messages:
arch/x86/kernel/entry_64.S:441: Error: can't resolve `L0' {*ABS* section} - `L0' {*UND* section}
scripts/Makefile.build:294: recipe for target 'arch/x86/kernel/entry_64.o' failed
make[1]: *** [arch/x86/kernel/entry_64.o] Error 1
Makefile:1536: recipe for target 'arch/x86/kernel/entry_64.o' failed
make: *** [arch/x86/kernel/entry_64.o] Error 2

but I guess we can simply use the naked "47" because a couple of lines
above, we already have the sanity-check:

.ifne __VIRTUAL_MASK_SHIFT - 47
.error "virtual address width changed -- SYSRET checks need update"
.endif

so we should be guarded just fine.

Anyway, if we do it this way, we get 17 NOPs added at build time which is the
length of the 4 instructions:

ffffffff819ef40c: 48 c1 e1 10 shl $0x10,%rcx
ffffffff819ef410: 48 c1 f9 10 sar $0x10,%rcx
ffffffff819ef414: 49 39 cb cmp %rcx,%r11
ffffffff819ef417: 0f 85 ff 9c bc ff jne ffffffff815b911c <opportunistic_sysret_failed>

and the 17 NOPs should be optimized at boot time on AMD.

I was initially afraid that the JMP in the 4th line might be wrong but
apparently since we're using a global label, gas/gcc does generate the
offset properly (0xffffffff815b911c):

ffffffff815b911c <opportunistic_sysret_failed>:
ffffffff815b911c: ff 15 fe a4 26 00 callq *0x26a4fe(%rip) # ffffffff81823620 <pv_cpu_ops+0x120>
ffffffff815b9122: e9 01 09 00 00 jmpq ffffffff815b9a28 <restore_c_regs_and_iret>
ffffffff815b9127: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
ffffffff815b912e: 00 00

So, on to do some tracing.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 12:08:48

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/27/2015 01:35 PM, Borislav Petkov wrote:
> On Mon, Apr 27, 2015 at 10:53:05AM +0200, Borislav Petkov wrote:
>> ALTERNATIVE "",
>> "shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
>> sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx \
>> cmpq %rcx, %r11 \
>> jne opportunistic_sysret_failed"
>> X86_BUG_SYSRET_CANONICAL_RCX
>
> Right, so I can do this:
>
> /*
> * Change top 16 bits to be the sign-extension of 47th bit, if this
> * changed %rcx, it was not canonical.
> */
> ALTERNATIVE "", \
> "shl $(64 - (47+1)), %rcx; \
> sar $(64 - (47+1)), %rcx; \
> cmpq %rcx, %r11; \
> jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>
> If I use the __VIRTUAL_MASK_SHIFT macro *in* the ALTERNATIVE macro, I get some
> really cryptic gas error:
>
> arch/x86/kernel/entry_64.S: Assembler messages:
> arch/x86/kernel/entry_64.S:441: Error: can't resolve `L0' {*ABS* section} - `L0' {*UND* section}
> scripts/Makefile.build:294: recipe for target 'arch/x86/kernel/entry_64.o' failed
> make[1]: *** [arch/x86/kernel/entry_64.o] Error 1
> Makefile:1536: recipe for target 'arch/x86/kernel/entry_64.o' failed
> make: *** [arch/x86/kernel/entry_64.o] Error 2
>
> but I guess we can simply use the naked "47" because a couple of lines
> above, we already have the sanity-check:
>
> .ifne __VIRTUAL_MASK_SHIFT - 47
> .error "virtual address width changed -- SYSRET checks need update"
> .endif
>
> so we should be guarded just fine.
>
> Anyway, if we do it this way, we get 17 NOPs added at build time which is the
> length of the 4 instructions:
>
> ffffffff819ef40c: 48 c1 e1 10 shl $0x10,%rcx
> ffffffff819ef410: 48 c1 f9 10 sar $0x10,%rcx
> ffffffff819ef414: 49 39 cb cmp %rcx,%r11
> ffffffff819ef417: 0f 85 ff 9c bc ff jne ffffffff815b911c <opportunistic_sysret_failed>

This looks strange. The opportunistic_sysret_failed label is just a few
instructions below. Why are you getting an "ff 9c bc ff" offset in the JNE
instead of the short jump of 0x5f bytes I see without ALTERNATIVE?

2015-04-27 12:49:14

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 02:08:40PM +0200, Denys Vlasenko wrote:
> > ffffffff819ef40c: 48 c1 e1 10 shl $0x10,%rcx
> > ffffffff819ef410: 48 c1 f9 10 sar $0x10,%rcx
> > ffffffff819ef414: 49 39 cb cmp %rcx,%r11
> > ffffffff819ef417: 0f 85 ff 9c bc ff jne ffffffff815b911c <opportunistic_sysret_failed>
>
> This looks strange. opportunistic_sysret_failed label is just a few
> instructions below. Why are you getting "ff 9c bc ff" offset in JNE
> instead of short jump of 0x5f bytes I see without ALTERNATIVE?

Because the replacement instructions are placed far away in the section
.altinstr_replacement and since we have relative JMPs, gas generates
JMP from that section to opportunistic_sysret_failed. That's why it is
negative too.

And looking at this more, I'm afraid even this current version won't
work. I recently added recompute_jump(), which is supposed to fix up the
JMPs and even make them smaller, but it won't help in this case because
it won't detect the JMP: it is the 4th instruction, not the first byte.
(And even if it did, it still wouldn't handle it because we're not
looking at conditional JMPs, i.e. Jcc, yet.)

What we could do is something like this instead:

jne opportunistic_sysret_failed - 1f
1:

so that the offset is correct. Need to experiment with this a bit first
though, for the exact placement of the label but it should show the
idea.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
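
The displacement problem is generic to copying rel32 branches: the encoded
offset is relative to the end of the instruction, so bytes assembled in
.altinstr_replacement and later copied over the original site need their
displacement adjusted by the distance they were moved. A small illustrative
sketch of that adjustment (hypothetical helper names, not the kernel's
recompute_jump()):

#include <stdint.h>
#include <string.h>

/*
 * src: where the branch was assembled (.altinstr_replacement)
 * dst: where its bytes were copied to (the original instruction site)
 * disp_off: offset of the rel32 field inside the instruction
 */
static void fixup_rel32(uint8_t *dst, const uint8_t *src, size_t disp_off)
{
        int32_t disp;

        memcpy(&disp, src + disp_off, sizeof(disp));
        disp += (int32_t)(src - dst);   /* same absolute target, new location */
        memcpy(dst + disp_off, &disp, sizeof(disp));
}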

2015-04-27 14:39:24

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Sun, Apr 26, 2015 at 04:39:38PM -0700, Andy Lutomirski wrote:
> I know it would be ugly, but would it be worth saving two bytes by
> using ALTERNATIVE "jmp 1f", "shl ...", ...?

Damn, it is actually visible that even saving the unconditional forward
JMP makes the numbers marginally nicer (E: row). So I guess we'll be
dropping the forward JMP too.

A: 2835570.145246 cpu-clock (msec) ( +- 0.02% ) [100.00%]
B: 2833364.074970 cpu-clock (msec) ( +- 0.04% ) [100.00%]
C: 2834708.335431 cpu-clock (msec) ( +- 0.02% ) [100.00%]
D: 2835055.118431 cpu-clock (msec) ( +- 0.01% ) [100.00%]
E: 2833115.118624 cpu-clock (msec) ( +- 0.06% ) [100.00%]

A: 2835570.099981 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
B: 2833364.073633 task-clock (msec) # 3.996 CPUs utilized ( +- 0.04% ) [100.00%]
C: 2834708.350387 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
D: 2835055.094383 task-clock (msec) # 3.996 CPUs utilized ( +- 0.01% ) [100.00%]
E: 2833115.145292 task-clock (msec) # 3.996 CPUs utilized ( +- 0.06% ) [100.00%]

A: 5,591,213,166,613 cycles # 1.972 GHz ( +- 0.03% ) [75.00%]
B: 5,585,023,802,888 cycles # 1.971 GHz ( +- 0.03% ) [75.00%]
C: 5,587,983,212,758 cycles # 1.971 GHz ( +- 0.02% ) [75.00%]
D: 5,584,838,532,936 cycles # 1.970 GHz ( +- 0.03% ) [75.00%]
E: 5,583,979,727,842 cycles # 1.971 GHz ( +- 0.05% ) [75.00%]

cycles is the lowest, nice.

A: 3,106,707,101,530 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]
B: 3,106,632,251,528 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
C: 3,106,265,958,142 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
D: 3,106,294,801,185 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
E: 3,106,381,223,355 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]

Understandable - we end up executing 5 insns more:

ffffffff815b90ac: 66 66 66 90 data16 data16 xchg %ax,%ax
ffffffff815b90b0: 66 66 66 90 data16 data16 xchg %ax,%ax
ffffffff815b90b4: 66 66 66 90 data16 data16 xchg %ax,%ax
ffffffff815b90b8: 66 66 66 90 data16 data16 xchg %ax,%ax
ffffffff815b90bc: 90 nop


A: 683,676,044,429 branches # 241.107 M/sec ( +- 0.01% ) [75.00%]
B: 683,670,899,595 branches # 241.293 M/sec ( +- 0.01% ) [75.00%]
C: 683,675,772,858 branches # 241.180 M/sec ( +- 0.01% ) [75.00%]
D: 683,683,533,664 branches # 241.154 M/sec ( +- 0.00% ) [75.00%]
E: 683,648,518,667 branches # 241.306 M/sec ( +- 0.01% ) [75.00%]

Lowest.

A: 43,829,535,008 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
B: 43,844,118,416 branch-misses # 6.41% of all branches ( +- 0.03% ) [75.00%]
C: 43,819,871,086 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
D: 43,795,107,998 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
E: 43,801,985,070 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]

That looks like noise to me - we shouldn't be getting more branch misses
with the E: version.

A: 2,030,357 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
B: 2,029,313 context-switches # 0.716 K/sec ( +- 0.05% ) [100.00%]
C: 2,028,566 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
D: 2,028,895 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
E: 2,031,008 context-switches # 0.717 K/sec ( +- 0.09% ) [100.00%]

A: 52,421 migrations # 0.018 K/sec ( +- 1.13% )
B: 52,049 migrations # 0.018 K/sec ( +- 1.02% )
C: 51,365 migrations # 0.018 K/sec ( +- 0.92% )
D: 51,766 migrations # 0.018 K/sec ( +- 1.11% )
E: 53,047 migrations # 0.019 K/sec ( +- 1.08% )

A: 709.528485252 seconds time elapsed ( +- 0.02% )
B: 708.976557288 seconds time elapsed ( +- 0.04% )
C: 709.312844791 seconds time elapsed ( +- 0.02% )
D: 709.400050112 seconds time elapsed ( +- 0.01% )
E: 708.914562508 seconds time elapsed ( +- 0.06% )

Nice.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 14:57:40

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
>
> /*
> * Change top 16 bits to be the sign-extension of 47th bit, if this
> * changed %rcx, it was not canonical.
> */
> ALTERNATIVE "", \
> "shl $(64 - (47+1)), %rcx; \
> sar $(64 - (47+1)), %rcx; \
> cmpq %rcx, %r11; \
> jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX

Guys, if we're looking at cycles for this, then don't do the "exact
canonical test", and go back to just doing

shr $__VIRTUAL_MASK_SHIFT, %rcx
jnz opportunistic_sysret_failed

which is much smaller. In fact, aim to make the conditional jump be a
two-byte one (jump forward to another jump if required - it's a
slow-path that doesn't matter at *all* for the taken case), and the
end result is just six bytes. That way you can use alternative to
replace it with one single noop on AMD.

Because dammit, if we're playing these kinds of games, let's do it
*right*. No half measures.

Linus
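
To make the trade-off concrete, the two checks can be written out in C. This
is only an illustration of their semantics, not proposed kernel code
(VIRT_SHIFT stands in for __VIRTUAL_MASK_SHIFT, i.e. 47 here):

#include <stdbool.h>

#define VIRT_SHIFT 47

/*
 * The exact test (what the shl/sar/cmp sequence implements): reject only
 * truly non-canonical addresses, i.e. those where bits 63..48 are not all
 * copies of bit 47.
 */
static bool rip_noncanonical(unsigned long rip)
{
        return (unsigned long)((long)(rip << (64 - (VIRT_SHIFT + 1)))
                               >> (64 - (VIRT_SHIFT + 1))) != rip;
}

/*
 * The cheaper test (shr + jnz): reject anything with a bit set at or above
 * bit 47. This also rejects canonical kernel-half addresses - the "false
 * positives" discussed later in the thread - which only matter for
 * oddities like vsyscall=native.
 */
static bool rip_not_plain_userspace(unsigned long rip)
{
        return (rip >> VIRT_SHIFT) != 0;
}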

2015-04-27 15:06:19

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 7:57 AM, Linus Torvalds
<[email protected]> wrote:
>
> ..end result is just six bytes. That way you can use alternative to
> replace it with one single noop on AMD.

Actually, it looks like we have no good 6-byte no-ops on AMD. So you'd
get two three-byte ones. Oh well. It's still better than five nops
that can't even be decoded all at once.

That said, our NOP tables look to be old for AMD. Looking at the AMD
optimization guide (family 16h), it says to use

66 0F 1F 44 00 00

which seems to be the same as Intel (it's "nopw 0x0(%rax,%rax,1)").

So maybe our AMD nop tables should be updated?

Linus

2015-04-27 15:35:42

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 08:06:16AM -0700, Linus Torvalds wrote:
> So maybe our AMD nop tables should be updated?

Ho-humm, we're using k8_nops on all 64-bit AMD. I better do some
opt-guide staring.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 15:46:51

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 07:57:36AM -0700, Linus Torvalds wrote:
> On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
> >
> > /*
> > * Change top 16 bits to be the sign-extension of 47th bit, if this
> > * changed %rcx, it was not canonical.
> > */
> > ALTERNATIVE "", \
> > "shl $(64 - (47+1)), %rcx; \
> > sar $(64 - (47+1)), %rcx; \
> > cmpq %rcx, %r11; \
> > jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>
> Guys, if we're looking at cycles for this, then don't do the "exact
> canonical test". and go back to just doing
>
> shr $__VIRTUAL_MASK_SHIFT, %rcx
> jnz opportunistic_sysret_failed
>
> which is much smaller.

Right, what about the false positives:

17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses")

? We don't care?

> In fact, aim to make the conditional jump be a
> two-byte one (jump forward to another jump if required - it's a
> slow-path that doesn't matter at *all* for the taken case), and the
> end result is just six bytes. That way you can use alternative to
> replace it with one single noop on AMD.

Well, even with the non-optimal NOPs (we end up with 4 3-byte NOPs and
one single-byte), we're still better than the unconditional JMP I had
there before:

https://lkml.kernel.org/r/[email protected]

(you might want to look at the raw email - marc.info breaks lines)

I'll retest with the F16h NOPs to see whether there's any difference.

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 15:56:46

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 8:46 AM, Borislav Petkov <[email protected]> wrote:
> On Mon, Apr 27, 2015 at 07:57:36AM -0700, Linus Torvalds wrote:
>> On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
>> >
>> > /*
>> > * Change top 16 bits to be the sign-extension of 47th bit, if this
>> > * changed %rcx, it was not canonical.
>> > */
>> > ALTERNATIVE "", \
>> > "shl $(64 - (47+1)), %rcx; \
>> > sar $(64 - (47+1)), %rcx; \
>> > cmpq %rcx, %r11; \
>> > jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>>
>> Guys, if we're looking at cycles for this, then don't do the "exact
>> canonical test". and go back to just doing
>>
>> shr $__VIRTUAL_MASK_SHIFT, %rcx
>> jnz opportunistic_sysret_failed
>>
>> which is much smaller.
>
> Right, what about the false positives:
>
> 17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses")
>
> ? We don't care?

The false positives only matter for very strange workloads, e.g.
vsyscall=native with old libc. If it's a measurable regression, we
could revert it.

--Andy

2015-04-27 16:00:12

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 8:46 AM, Borislav Petkov <[email protected]> wrote:
>
> Right, what about the false positives:

Anybody who tries to return to kernel addresses with sysret is
suspect. It's more likely to be an attack vector than anything else
(ie somebody who is trying to take advantage of a CPU bug).

I don't think there are any false positives. The only valid sysret
targets are in normal user space.

There's the "vsyscall" area, I guess, but we are actively discouraging
people from using it (it's emulated by default) and using iret to
return from it is fine if somebody ends up using it natively. It was a
mistake to have fixed addresses with known code in it, so I don't
think we should care.

We've had the inexact version for a long time, and the exact canonical
address check hasn't even hit my tree yet. I wouldn't worry about it.

And since we haven't even merged the "better check for canonical
addresses" it cannot even be a regression if we never really use it.

Linus

2015-04-27 16:04:46

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 11:56 AM, Andy Lutomirski <[email protected]> wrote:
> On Mon, Apr 27, 2015 at 8:46 AM, Borislav Petkov <[email protected]> wrote:
>> On Mon, Apr 27, 2015 at 07:57:36AM -0700, Linus Torvalds wrote:
>>> On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
>>> >
>>> > /*
>>> > * Change top 16 bits to be the sign-extension of 47th bit, if this
>>> > * changed %rcx, it was not canonical.
>>> > */
>>> > ALTERNATIVE "", \
>>> > "shl $(64 - (47+1)), %rcx; \
>>> > sar $(64 - (47+1)), %rcx; \
>>> > cmpq %rcx, %r11; \
>>> > jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>>>
>>> Guys, if we're looking at cycles for this, then don't do the "exact
>>> canonical test". and go back to just doing
>>>
>>> shr $__VIRTUAL_MASK_SHIFT, %rcx
>>> jnz opportunistic_sysret_failed
>>>
>>> which is much smaller.
>>
>> Right, what about the false positives:
>>
>> 17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses")
>>
>> ? We don't care?
>
> The false positives only matter for very strange workloads, e.g.
> vsyscall=native with old libc. If it's a measurable regression, we
> could revert it.
>
> --Andy

Another alternative is to do the canonical check in the paths that can
set user RIP with an untrusted value, ie, sigreturn and exec.

--
Brian Gerst

2015-04-27 16:10:59

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/27/2015 06:04 PM, Brian Gerst wrote:
> On Mon, Apr 27, 2015 at 11:56 AM, Andy Lutomirski <[email protected]> wrote:
>> On Mon, Apr 27, 2015 at 8:46 AM, Borislav Petkov <[email protected]> wrote:
>>> On Mon, Apr 27, 2015 at 07:57:36AM -0700, Linus Torvalds wrote:
>>>> On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
>>>>>
>>>>> /*
>>>>> * Change top 16 bits to be the sign-extension of 47th bit, if this
>>>>> * changed %rcx, it was not canonical.
>>>>> */
>>>>> ALTERNATIVE "", \
>>>>> "shl $(64 - (47+1)), %rcx; \
>>>>> sar $(64 - (47+1)), %rcx; \
>>>>> cmpq %rcx, %r11; \
>>>>> jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>>>>
>>>> Guys, if we're looking at cycles for this, then don't do the "exact
>>>> canonical test". and go back to just doing
>>>>
>>>> shr $__VIRTUAL_MASK_SHIFT, %rcx
>>>> jnz opportunistic_sysret_failed
>>>>
>>>> which is much smaller.
>>>
>>> Right, what about the false positives:
>>>
>>> 17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses")
>>>
>>> ? We don't care?
>>
>> The false positives only matter for very strange workloads, e.g.
>> vsyscall=native with old libc. If it's a measurable regression, we
>> could revert it.
>>
>> --Andy
>
> Another alternative is to do the canonical check in the paths that can
> set user RIP with an untrusted value, ie, sigreturn and exec.

It is already done only on that path. Fast path doesn't check
RCX for canonicalness.

2015-04-27 16:12:40

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/27/2015 04:57 PM, Linus Torvalds wrote:
> On Mon, Apr 27, 2015 at 4:35 AM, Borislav Petkov <[email protected]> wrote:
>>
>> /*
>> * Change top 16 bits to be the sign-extension of 47th bit, if this
>> * changed %rcx, it was not canonical.
>> */
>> ALTERNATIVE "", \
>> "shl $(64 - (47+1)), %rcx; \
>> sar $(64 - (47+1)), %rcx; \
>> cmpq %rcx, %r11; \
>> jne opportunistic_sysret_failed", X86_BUG_SYSRET_CANON_RCX
>
> Guys, if we're looking at cycles for this, then don't do the "exact
> canonical test". and go back to just doing
>
> shr $__VIRTUAL_MASK_SHIFT, %rcx
> jnz opportunistic_sysret_failed
>
> which is much smaller.

It is smaller, but not by much. It is two instructions smaller.
At the disassembly level, the changes are:

cmp %rcx,0x80(%rsp) -> mov 0x80(%rsp),%r11; cmp %rcx,%r11
shr $0x2f,%rcx -> shl $0x10,%rcx; sar $0x10,%rcx; cmp %rcx,%r11
mov 0x58(%rsp),%rcx -> (eliminated)

2015-04-27 16:40:43

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 09:00:08AM -0700, Linus Torvalds wrote:
> On Mon, Apr 27, 2015 at 8:46 AM, Borislav Petkov <[email protected]> wrote:
> >
> > Right, what about the false positives:
>
> Anybody who tries to return to kernel addresses with sysret is
> suspect. It's more likely to be an attack vector than anything else
> (ie somebody who is trying to take advantage of a CPU bug).
>
> I don't think there are any false positives. The only valid sysret
> targets are in normal user space.
>
> There's the "vsyscall" area, I guess, but we are actively discouraging

vsyscall=native on old glibc, says Andy.

> people from using it (it's emulated by default) and using iret to
> return from it is fine if somebody ends up using it natively. It was a
> mistake to have fixed addresses with known code in it, so I don't
> think we should care.
>
> We've had the inexact version for a long time, and the exact canonical
> address check hasn't even hit my tree yet. I wouldn't worry about it.
>
> And since we haven't even merged the "better check for canonical
> addresses" it cannot even be a regression if we never really use it.

Well, it certainly sounds to me like it's not really worth the trouble to
do the exact check. So I'm all for dropping it - unless Andy comes up
with a "but but, there's this use case..."

Either way, the NOPs-version is faster and I'm running the test with the
F16h-specific NOPs to see how they perform.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 18:12:10

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 9:12 AM, Denys Vlasenko <[email protected]> wrote:
>
> It is smaller, but not by much. It is two instructions smaller.

Ehh. That's _half_.

And on the decoding side, it's the difference between 6 bytes that
decode cleanly and can be decoded in parallel with other things
(assuming the 6-byte nop), and 13 bytes that will need at least 2 nops
(unless you want to do lots of prefixes, which is slow on some cores),
_and_ which is likely big enough that you will basically not be
decoding anything else that cycle.

So on the whole, your "smaller, but not by much" is all relative. It's
a relatively big difference.

So if one or two cycles in this code doesn't matter, then why are we
adding alternate instructions just to avoid a few ALU instructions and
a conditional branch that predicts perfectly? And if it does matter,
then the 6-byte option looks clearly better..

Linus

2015-04-27 18:14:20

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 9:40 AM, Borislav Petkov <[email protected]> wrote:
>
> Either way, the NOPs-version is faster and I'm running the test with the
> F16h-specific NOPs to see how they perform.

Btw, please don't use the "more than three 66h overrides" version.
Sure, that's what the optimization manual suggests if you want
single-instruction decode for all sizes up to 15 bytes, but I think
we're better off with the two-nop case for sizes 12-15 (4-byte nop
followed by 8-11 byte nop).

Because the "more than three 66b prefixes" really performs abysmally
on some cores, iirc.

Linus

2015-04-27 18:39:42

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 11:14:15AM -0700, Linus Torvalds wrote:
> Btw, please don't use the "more than three 66h overrides" version.

Oh yeah, a notorious "frontend choker".

> Sure, that's what the optimization manual suggests if you want
> single-instruction decode for all sizes up to 15 bytes, but I think
> we're better off with the two-nop case for sizes 12-15) (4-byte nop
> followed by 8-11 byte nop).

Yeah, so says the manual. Although I wouldn't trust those manuals
blindly but that's another story.

> Because the "more than three 66b prefixes" really performs abysmally
> on some cores, iirc.

Right.

So our current NOP infrastructure has ASM_NOP_MAX of 8 bytes, so
without more invasive changes our longest NOPs are 8 bytes long and then
we have to repeat. This is consistent with what the code looks like here
after alternatives application:

ffffffff815b9084 <syscall_return>:
...

ffffffff815b90ac: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
ffffffff815b90b3: 00
ffffffff815b90b4: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
ffffffff815b90bb: 00
ffffffff815b90bc: 90 nop

You can recognize the p6_nops being the same as in-the-manual-suggested
F16h ones.

:-)

I'm running them now and will report numbers relative to the last run
once it is done. And those numbers should in practice get even better if
we revert to the simpler canonical-ness check but let's see...

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 18:47:32

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 11:38 AM, Borislav Petkov <[email protected]> wrote:
>
> So our current NOP-infrastructure does ASM_NOP_MAX NOPs of 8 bytes so
> without more invasive changes, our longest NOPs are 8 byte long and then
> we have to repeat.

Btw (and I'm too lazy to check) do we take alignment into account?

Because if you have to split, and use multiple nops, it is *probably*
a good idea to try to avoid 16-byte boundaries, since that can be
the I$ fetch granularity from L1 (although I guess 32B is getting more
common).

So the exact split might depend on the alignment of the nop replacement..

> You can recognize the p6_nops being the same as in-the-manual-suggested
> F16h ones.

Can we perhaps get rid of the distinction entirely, and just use one
set of 64-bit nops for both Intel/AMD?

Linus

2015-04-27 18:47:51

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 11:12:05AM -0700, Linus Torvalds wrote:
> So if one or two cycles in this code doesn't matter, then why are we
> adding alternate instructions just to avoid a few ALU instructions and
> a conditional branch that predicts perfectly? And if it does matter,
> then the 6-byte option looks clearly better..

You know what? I haven't even measured the tree *without* Denys'
stricter RCX canonical-ness patch. All numbers so far are from 4.0+ with
tip/master ontop which has Denys' patch.

And I *should* measure once with plain 4.1-rc1 and then with Denys'
patch ontop to see whether this all jumping-thru-alternatives-hoops is
even worth it.

If nothing else, it was a nice exercise. :-)

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 18:54:05

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 11:47:30AM -0700, Linus Torvalds wrote:
> On Mon, Apr 27, 2015 at 11:38 AM, Borislav Petkov <[email protected]> wrote:
> >
> > So our current NOP-infrastructure does ASM_NOP_MAX NOPs of 8 bytes so
> > without more invasive changes, our longest NOPs are 8 byte long and then
> > we have to repeat.
>
> Btw (and I'm too lazy to check) do we take alignment into account?
>
> Because if you have to split, and use multiple nops, it is *probably*
> a good idea to try to avoid 16-byte boundaries, since that's can be
> the I$ fetch granularity from L1 (although I guess 32B is getting more
> common).

Yeah, on F16h you have 32B fetch but the paths later in the machine
get narrower, so to speak.

> So the exact split might depend on the alignment of the nop replacement..

Yeah, no. Our add_nops() is trivial:

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
        while (len > 0) {
                unsigned int noplen = len;
                if (noplen > ASM_NOP_MAX)
                        noplen = ASM_NOP_MAX;
                memcpy(insns, ideal_nops[noplen], noplen);
                insns += noplen;
                len -= noplen;
        }
}

> Can we perhaps get rid of the distinction entirely, and just use one
> set of 64-bit nops for both Intel/AMD?

I *think* hpa would have an opinion here. I'm judging by looking at
comments like this one in the code:

/*
* Due to a decoder implementation quirk, some
* specific Intel CPUs actually perform better with
* the "k8_nops" than with the SDM-recommended NOPs.
*/

which is a fun one in itself. :-)

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 19:12:06

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 08:38:54PM +0200, Borislav Petkov wrote:
> I'm running them now and will report numbers relative to the last run
> once it is done. And those numbers should in practice get even better if
> we revert to the simpler canonical-ness check but let's see...

Results are done. New row is F: which is with the F16h NOPs.

With all things equal and with this change ontop:

---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aef653193160..d713080005ef 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -227,6 +227,14 @@ void __init arch_init_ideal_nops(void)
 #endif
                 }
                 break;
+
+        case X86_VENDOR_AMD:
+                if (boot_cpu_data.x86 == 0x16) {
+                        ideal_nops = p6_nops;
+                        return;
+                }
+
+                /* fall through */
         default:
 #ifdef CONFIG_X86_64
                 ideal_nops = k8_nops;
---

... cycles, instructions, branches, branch-misses, context-switches
drop or remain roughly the same. BUT(!) timings increases.
cpu-clock/task-clock and duration of the workload are all the worst of
all possible cases.

So either those NOPs are not really optimal (i.e., trusting the manuals
and so on :-)) or it is their alignment.

But look at the chapter in the manual - "2.7.2.1 Encoding Padding for
Loop Alignment" - those NOPs are supposed to be used as padding so
they themselves will not be necessarily aligned when you use them to pad
stuff.

Or maybe using the longer NOPs is simply worse than the shorter 4-byte
ones with three 0x66 prefixes, which should "flow" more easily through
the pipe due to their smaller length.

Or something completely different...

Oh well, enough measurements for today - will do the rc1 measurement
tomorrow.

Thanks.

---
Performance counter stats for 'system wide' (10 runs):

A: 2835570.145246 cpu-clock (msec) ( +- 0.02% ) [100.00%]
B: 2833364.074970 cpu-clock (msec) ( +- 0.04% ) [100.00%]
C: 2834708.335431 cpu-clock (msec) ( +- 0.02% ) [100.00%]
D: 2835055.118431 cpu-clock (msec) ( +- 0.01% ) [100.00%]
E: 2833115.118624 cpu-clock (msec) ( +- 0.06% ) [100.00%]
F: 2835863.670798 cpu-clock (msec) ( +- 0.02% ) [100.00%]

A: 2835570.099981 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
B: 2833364.073633 task-clock (msec) # 3.996 CPUs utilized ( +- 0.04% ) [100.00%]
C: 2834708.350387 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]
D: 2835055.094383 task-clock (msec) # 3.996 CPUs utilized ( +- 0.01% ) [100.00%]
E: 2833115.145292 task-clock (msec) # 3.996 CPUs utilized ( +- 0.06% ) [100.00%]
F: 2835863.719556 task-clock (msec) # 3.996 CPUs utilized ( +- 0.02% ) [100.00%]

A: 5,591,213,166,613 cycles # 1.972 GHz ( +- 0.03% ) [75.00%]
B: 5,585,023,802,888 cycles # 1.971 GHz ( +- 0.03% ) [75.00%]
C: 5,587,983,212,758 cycles # 1.971 GHz ( +- 0.02% ) [75.00%]
D: 5,584,838,532,936 cycles # 1.970 GHz ( +- 0.03% ) [75.00%]
E: 5,583,979,727,842 cycles # 1.971 GHz ( +- 0.05% ) [75.00%]
F: 5,581,639,840,197 cycles # 1.968 GHz ( +- 0.03% ) [75.00%]

A: 3,106,707,101,530 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]
B: 3,106,632,251,528 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
C: 3,106,265,958,142 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
D: 3,106,294,801,185 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]
E: 3,106,381,223,355 instructions # 0.56 insns per cycle ( +- 0.01% ) [75.00%]
F: 3,105,996,162,436 instructions # 0.56 insns per cycle ( +- 0.00% ) [75.00%]

A: 683,676,044,429 branches # 241.107 M/sec ( +- 0.01% ) [75.00%]
B: 683,670,899,595 branches # 241.293 M/sec ( +- 0.01% ) [75.00%]
C: 683,675,772,858 branches # 241.180 M/sec ( +- 0.01% ) [75.00%]
D: 683,683,533,664 branches # 241.154 M/sec ( +- 0.00% ) [75.00%]
E: 683,648,518,667 branches # 241.306 M/sec ( +- 0.01% ) [75.00%]
F: 683,663,028,656 branches # 241.078 M/sec ( +- 0.00% ) [75.00%]

A: 43,829,535,008 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
B: 43,844,118,416 branch-misses # 6.41% of all branches ( +- 0.03% ) [75.00%]
C: 43,819,871,086 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
D: 43,795,107,998 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
E: 43,801,985,070 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]
F: 43,804,449,271 branch-misses # 6.41% of all branches ( +- 0.02% ) [75.00%]

A: 2,030,357 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
B: 2,029,313 context-switches # 0.716 K/sec ( +- 0.05% ) [100.00%]
C: 2,028,566 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
D: 2,028,895 context-switches # 0.716 K/sec ( +- 0.06% ) [100.00%]
E: 2,031,008 context-switches # 0.717 K/sec ( +- 0.09% ) [100.00%]
F: 2,028,132 context-switches # 0.715 K/sec ( +- 0.05% ) [100.00%]

A: 52,421 migrations # 0.018 K/sec ( +- 1.13% )
B: 52,049 migrations # 0.018 K/sec ( +- 1.02% )
C: 51,365 migrations # 0.018 K/sec ( +- 0.92% )
D: 51,766 migrations # 0.018 K/sec ( +- 1.11% )
E: 53,047 migrations # 0.019 K/sec ( +- 1.08% )
F: 51,447 migrations # 0.018 K/sec ( +- 0.86% )

A: 709.528485252 seconds time elapsed ( +- 0.02% )
B: 708.976557288 seconds time elapsed ( +- 0.04% )
C: 709.312844791 seconds time elapsed ( +- 0.02% )
D: 709.400050112 seconds time elapsed ( +- 0.01% )
E: 708.914562508 seconds time elapsed ( +- 0.06% )
F: 709.602255085 seconds time elapsed ( +- 0.02% )

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 19:21:40

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/27/2015 09:11 PM, Borislav Petkov wrote:
> A: 709.528485252 seconds time elapsed ( +- 0.02% )
> B: 708.976557288 seconds time elapsed ( +- 0.04% )
> C: 709.312844791 seconds time elapsed ( +- 0.02% )
> D: 709.400050112 seconds time elapsed ( +- 0.01% )
> E: 708.914562508 seconds time elapsed ( +- 0.06% )
> F: 709.602255085 seconds time elapsed ( +- 0.02% )

That's about 0.2% variance. Very small.

Sounds obvious, but. Did you try running a test several times?
Maybe you are measuring random noise.

2015-04-27 19:45:32

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 09:21:34PM +0200, Denys Vlasenko wrote:
> On 04/27/2015 09:11 PM, Borislav Petkov wrote:
> > A: 709.528485252 seconds time elapsed ( +- 0.02% )
> > B: 708.976557288 seconds time elapsed ( +- 0.04% )
> > C: 709.312844791 seconds time elapsed ( +- 0.02% )
> > D: 709.400050112 seconds time elapsed ( +- 0.01% )
> > E: 708.914562508 seconds time elapsed ( +- 0.06% )
> > F: 709.602255085 seconds time elapsed ( +- 0.02% )
>
> That's about 0.2% variance. Very small.

Right, I'm doubtful this is the right workload for this - and actually
whether any workload would show any serious difference. Perhaps it all
doesn't really matter and we shouldn't do anything at all.

> Sounds obvious, but. Did you try running a test several times?

All runs so far are done with perf stat ... --repeat 10, so 10 kernel
builds, and the results are averaged.

> Maybe you are measuring random noise.

Yeah. Last exercise tomorrow. Let's see what those numbers would look
like.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 20:00:06

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

It really comes down to this: it seems older cores from both Intel and AMD perform better with 66 66 66 90, whereas the 0F 1F series is better on newer cores.

When I measured it, the differences were sometimes dramatic.

On April 27, 2015 11:53:44 AM PDT, Borislav Petkov <[email protected]> wrote:
>On Mon, Apr 27, 2015 at 11:47:30AM -0700, Linus Torvalds wrote:
>> On Mon, Apr 27, 2015 at 11:38 AM, Borislav Petkov <[email protected]>
>wrote:
>> >
>> > So our current NOP-infrastructure does ASM_NOP_MAX NOPs of 8 bytes
>so
>> > without more invasive changes, our longest NOPs are 8 byte long and
>then
>> > we have to repeat.
>>
>> Btw (and I'm too lazy to check) do we take alignment into account?
>>
>> Because if you have to split, and use multiple nops, it is *probably*
>> a good idea to try to avoid 16-byte boundaries, since that's can be
>> the I$ fetch granularity from L1 (although I guess 32B is getting
>more
>> common).
>
>Yeah, on F16h you have 32B fetch but the paths later in the machine
>gets narrower, so to speak.
>
>> So the exact split might depend on the alignment of the nop
>replacement..
>
>Yeah, no. Our add_nops() is trivial:
>
>/* Use this to add nops to a buffer, then text_poke the whole buffer.
>*/
>static void __init_or_module add_nops(void *insns, unsigned int len)
>{
> while (len > 0) {
> unsigned int noplen = len;
> if (noplen > ASM_NOP_MAX)
> noplen = ASM_NOP_MAX;
> memcpy(insns, ideal_nops[noplen], noplen);
> insns += noplen;
> len -= noplen;
> }
>}
>
>> Can we perhaps get rid of the distinction entirely, and just use one
>> set of 64-bit nops for both Intel/AMD?
>
>I *think* hpa would have an opinion here. I'm judging by looking at
>comments like this one in the code:
>
> /*
> * Due to a decoder implementation quirk, some
> * specific Intel CPUs actually perform better with
> * the "k8_nops" than with the SDM-recommended NOPs.
> */
>
>which is a fun one in itself. :-)

--
Sent from my mobile phone. Please pardon brevity and lack of formatting.

2015-04-27 20:03:53

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 12:59:11PM -0700, H. Peter Anvin wrote:
> It really comes down to this: it seems older cores from both Intel
> and AMD perform better with 66 66 66 90, whereas the 0F 1F series is
> better on newer cores.
>
> When I measured it, the differences were sometimes dramatic.

How did you measure that? I should probably do the same on some newer
AMD machines... We're using k8_nops on all AMD, even though the
optimization manuals cite different NOPs on newer AMD families.

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-27 20:15:32

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

I did a microbenchmark in user space... let's see if I can find it.

On April 27, 2015 1:03:29 PM PDT, Borislav Petkov <[email protected]> wrote:
>On Mon, Apr 27, 2015 at 12:59:11PM -0700, H. Peter Anvin wrote:
>> It really comes down to this: it seems older cores from both Intel
>> and AMD perform better with 66 66 66 90, whereas the 0F 1F series is
>> better on newer cores.
>>
>> When I measured it, the differences were sometimes dramatic.
>
>How did you measure that? I should probably do the same on some newer
>AMD machines... We're using k8_nops on all AMD, even though the
>optimization manuals cite different NOPs on newer AMD families.
>
>Thanks.

--
Sent from my mobile phone. Please pardon brevity and lack of formatting.

2015-04-28 13:40:37

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 09:45:12PM +0200, Borislav Petkov wrote:
> > Maybe you are measuring random noise.
>
> Yeah. Last exercise tomorrow. Let's see what those numbers would look
> like.

Right, so with Mel's help, I did a simple microbenchmark to measure how
many cycles a syscall (getpid()) needs on 4.1-rc1 and with your patch.

4.1-rc1
-------
Running 60 times, 10000000 loops per run.
Cycles: 3977233027, cycles/syscall: 397.723303
Cycles: 3964979519, cycles/syscall: 396.497952
Cycles: 3962470261, cycles/syscall: 396.247026
Cycles: 3963524693, cycles/syscall: 396.352469
Cycles: 3962853704, cycles/syscall: 396.285370
Cycles: 3964603727, cycles/syscall: 396.460373
Cycles: 3964758926, cycles/syscall: 396.475893
Cycles: 3965268019, cycles/syscall: 396.526802
Cycles: 3962198683, cycles/syscall: 396.219868
...


4.1-rc1 + 17be0aec74fb036eb4eb32c2268f3420a034762b from tip
-----------------------------------------------------------
Running 60 times, 10000000 loops per run.
Cycles: 3973575441, cycles/syscall: 397.357544
Cycles: 3963999393, cycles/syscall: 396.399939
Cycles: 3962613575, cycles/syscall: 396.261357
Cycles: 3963440408, cycles/syscall: 396.344041
Cycles: 3963475255, cycles/syscall: 396.347526
Cycles: 3964471785, cycles/syscall: 396.447179
Cycles: 3962890513, cycles/syscall: 396.289051
Cycles: 3964940114, cycles/syscall: 396.494011
Cycles: 3964186426, cycles/syscall: 396.418643
...

So yeah, your patch is fine - provided I've done everything right. Here's
the microbenchmark:

---
/*
 * How to run:
 *
 * taskset -c <cpunum> ./sys
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <stdlib.h>
#include <unistd.h>

typedef unsigned long long u64;

#define DECLARE_ARGS(val, low, high) unsigned low, high
#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)

static __always_inline unsigned long long rdtsc(void)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

static long my_getpid(void)
{
        long ret;
        asm volatile ("syscall" :
                      "=a" (ret) :
                      "a" (SYS_getpid) :
                      "memory", "cc", "rcx", "r11");
        return ret;
}

static inline u64 read_tsc(void)
{
        u64 ret;

        asm volatile("mfence");
        ret = rdtsc();
        asm volatile("mfence");

        return ret;
}


int main()
{
        int i, j;
        u64 p1, p2;
        u64 count = 0;

#define TIMES 60
#define LOOPS 10000000ULL

        printf("Running %d times, %lld loops per run.\n", TIMES, LOOPS);

        for (j = 0; j < TIMES; j++) {
                for (i = 0; i < LOOPS; i++) {
                        p1 = read_tsc();
                        my_getpid();
                        p2 = read_tsc();

                        count += (p2 - p1);
                }

                printf("Cycles: %lld, cycles/syscall: %f\n",
                       count, (double)count / LOOPS);

                count = 0;
        }

        return 0;
}

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-28 15:55:47

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Mon, Apr 27, 2015 at 01:14:51PM -0700, H. Peter Anvin wrote:
> I did a microbenchmark in user space... let's see if I can find it.

How about the simple one below?

Provided it is correct, it shows that the 0x66-prefixed 3-byte NOPs are
better than the 0F 1F 00 suggested by the manual (Haha!):

$ taskset -c 3 ./nops
Running 600 times, 10000000 loops per run.
nop_0x90 average: 439.805220
nop_3_byte average: 442.412915

---
/*
 * How to run:
 *
 * taskset -c <cpunum> argv0
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <stdlib.h>
#include <unistd.h>

typedef unsigned long long u64;

#define DECLARE_ARGS(val, low, high) unsigned low, high
#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)

static __always_inline unsigned long long rdtsc(void)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

static inline u64 read_tsc(void)
{
        u64 ret;

        asm volatile("mfence");
        ret = rdtsc();
        asm volatile("mfence");

        return ret;
}

static inline void nop_0x90(void)
{
        asm volatile(
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"

                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"

                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
                ".byte 0x66, 0x66, 0x90\n\t"
        );
}

static inline void nop_3_byte(void)
{
        asm volatile(
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"

                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"

                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
                ".byte 0x0f, 0x1f, 0x00\n\t"
        );
}

int main()
{
        int i, j;
        u64 p1, p2;
        u64 r;
        double avg, t;

#define TIMES 600
#define LOOPS 10000000ULL

        printf("Running %d times, %lld loops per run.\n", TIMES, LOOPS);

        avg = 0;

        for (r = 0, j = 0; j < TIMES; j++) {
                for (i = 0; i < LOOPS; i++) {
                        p1 = read_tsc();
                        nop_0x90();
                        p2 = read_tsc();

                        r += (p2 - p1);
                }

                t = (double)r / LOOPS;

                // printf("NOP cycles: %lld, cycles/nop_0x90: %f\n", r, t);
                avg += t;
                r = 0;
        }

        printf("nop_0x90 average: %f\n", avg/TIMES);

        avg = 0;

        for (r = 0, j = 0; j < TIMES; j++) {
                for (i = 0; i < LOOPS; i++) {
                        p1 = read_tsc();
                        nop_3_byte();
                        p2 = read_tsc();

                        r += (p2 - p1);
                }

                t = (double)r / LOOPS;

                // printf("NOP cycles: %lld, cycles/nop_3_byte: %f\n", r, t);
                avg += t;
                r = 0;
        }

        printf("nop_3_byte average: %f\n", avg/TIMES);

        return 0;
}

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-28 16:28:56

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Tue, Apr 28, 2015 at 8:55 AM, Borislav Petkov <[email protected]> wrote:
>
> Provided it is correct, it shows that the 0x66-prefixed 3-byte NOPs are
> better than the 0F 1F 00 suggested by the manual (Haha!):

That's which AMD CPU?

On my intel i7-4770S, they are the same cost (I cut down your loop
numbers by an order of magnitude each because I couldn't be arsed to
wait for it, so it might be off by a cycle or two):

Running 60 times, 1000000 loops per run.
nop_0x90 average: 81.065681
nop_3_byte average: 80.230101

That said, I think your benchmark tests the speed of "rdtsc" rather
than the no-ops. Putting the read_tsc inside the inner loop basically
makes it swamp everything else.

> $ taskset -c 3 ./nops
> Running 600 times, 10000000 loops per run.
> nop_0x90 average: 439.805220
> nop_3_byte average: 442.412915

I think that's in the noise, and could be explained by random
alignment of the loop too, or even random factors like "the CPU heated
up, so the later run was slightly slower". The difference between 439
and 442 doesn't strike me as all that significant.

It might be better to *not* inline, and instead make a real function
call to something that has a lot of no-ops (do some preprocessor magic
to make more no-ops in one go). At least that way the alignment is
likely the same for the two cases.
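
For instance, a minimal sketch of that idea (function names and the repeat
count are made up; it would still use the read_tsc() harness from above):

__attribute__((noinline)) static void nops_0x90(void)
{
	/* 1000 copies of the 3-byte 66 66 90 NOP in one non-inlined body */
	asm volatile(".rept 1000\n\t.byte 0x66, 0x66, 0x90\n\t.endr");
}

__attribute__((noinline)) static void nops_0f1f(void)
{
	/* 1000 copies of the 3-byte 0F 1F 00 NOP */
	asm volatile(".rept 1000\n\t.byte 0x0f, 0x1f, 0x00\n\t.endr");
}

That way both variants pay the same call overhead and their alignment no
longer depends on where the timing loop happens to land.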

Or if not that, then I think you're better off with something like

p1 = read_tsc();
for (i = 0; i < LOOPS; i++) {
nop_0x90();

}
p2 = read_tsc();
r = (p2 - p1);

because while you're now measuring the loop overhead too, that's
*much* smaller than the rdtsc overhead. So I get something like

Running 600 times, 1000000 loops per run.
nop_0x90 average: 3.786935
nop_3_byte average: 3.677228

and notice the difference between "~80 cycles" and "~3.7 cycles".
Yeah, that's rdtsc. I bet your 440 is about the same thing too.

Btw, the whole thing about "averaging cycles" is not the right thing
to do either. You should probably take the *minimum* cycles count, not
the average, because anything non-minimal means "some perturbation"
(ie interrupt etc).
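
A minimal sketch of that, reusing the pieces from the benchmark above
(this is not the attached t.c; counts and names are illustrative):

	u64 best = ~0ULL;

	for (j = 0; j < TIMES; j++) {
		p1 = read_tsc();
		for (i = 0; i < LOOPS; i++)
			nop_0x90();
		p2 = read_tsc();

		/* keep only the cleanest run; anything larger got perturbed */
		if (p2 - p1 < best)
			best = p2 - p1;
	}

	/* cycles per nop_0x90() call (each call executes 15 NOPs here) */
	printf("nop_0x90 best: %f cycles/call\n", (double)best / LOOPS);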

So I think something like the attached would be better. It gives an
approximate "cycles per one four-byte nop", and I get

[torvalds@i7 ~]$ taskset -c 3 ./a.out
Running 60 times, 1000000 loops per run.
nop_0x90 average: 0.200479
nop_3_byte average: 0.199694

which sounds suspiciously good to me (5 nops per cycle? uop cache and
nop compression, I guess).

Linus


Attachments:
t.c (1.85 kB)

2015-04-28 16:58:28

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Tue, Apr 28, 2015 at 09:28:52AM -0700, Linus Torvalds wrote:
> On Tue, Apr 28, 2015 at 8:55 AM, Borislav Petkov <[email protected]> wrote:
> >
> > Provided it is correct, it shows that the 0x66-prefixed 3-byte NOPs are
> > better than the 0F 1F 00 suggested by the manual (Haha!):
>
> That's which AMD CPU?

F16h.

> On my intel i7-4770S, they are the same cost (I cut down your loop
> numbers by an order of magnitude each because I couldn't be arsed to
> wait for it, so it might be off by a cycle or two):
>
> Running 60 times, 1000000 loops per run.
> nop_0x90 average: 81.065681
> nop_3_byte average: 80.230101
>
> That said, I think your benchmark tests the speed of "rdtsc" rather
> than the no-ops. Putting the read_tsc inside the inner loop basically
> makes it swamp everything else.

Whoops, now that you mention it... of course, that RDTSC *along* with
the barriers around it is much, much more expensive than the NOPs.

> > $ taskset -c 3 ./nops
> > Running 600 times, 10000000 loops per run.
> > nop_0x90 average: 439.805220
> > nop_3_byte average: 442.412915
>
> I think that's in the noise, and could be explained by random
> alignment of the loop too, or even random factors like "the CPU heated
> up, so the later run was slightly slower". The difference between 439
> and 442 doesn't strike me as all that significant.
>
> It might be better to *not* inline, and instead make a real function
> call to something that has a lot of no-ops (do some preprocessor magic
> to make more no-ops in one go). At least that way the alignment is
> likely the same for the two cases.

malloc a page, populate it with NOPs, slap a RET at the end and jump to
it? Maybe even more than 1 page?
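
Roughly something like this, I guess (a rough sketch only; page size, NOP
choice and error handling are arbitrary):

#include <string.h>
#include <sys/mman.h>

static void (*make_nop_page(void))(void)
{
	/* one executable page full of single-byte NOPs, terminated by RET */
	unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;

	memset(p, 0x90, 4095);	/* 0x90 = NOP */
	p[4095] = 0xc3;		/* 0xc3 = RET */

	return (void (*)(void))p;
}

The timed loop would then just call the returned function pointer.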

> Or if not that, then I think you're better off with something like
>
> p1 = read_tsc();
> for (i = 0; i < LOOPS; i++) {
> nop_0x90();
>
> }
> p2 = read_tsc();
> r = (p2 - p1);
>
> because while you're now measuring the loop overhead too, that's
> *much* smaller than the rdtsc overhead. So I get something like

Yap, that looks better.

> Running 600 times, 1000000 loops per run.
> nop_0x90 average: 3.786935
> nop_3_byte average: 3.677228
>
> and notice the difference between "~80 cycles" and "~3.7 cycles".
> Yeah, that's rdtsc. I bet your 440 is about the same thing too.
>
> Btw, the whole thing about "averaging cycles" is not the right thing
> to do either. You should probably take the *minimum* cycles count, not
> the average, because anything non-minimal means "some perturbation"
> (ie interrupt etc).

My train of thought was: if you do a *lot* of runs, perturbations would
average out. But ok, noted.

> So I think something like the attached would be better. It gives an
> approximate "cycles per one four-byte nop", and I get
>
> [torvalds@i7 ~]$ taskset -c 3 ./a.out
> Running 60 times, 1000000 loops per run.
> nop_0x90 average: 0.200479
> nop_3_byte average: 0.199694
>
> which sounds suspiciously good to me (5 nops per cycle? uop cache and
> nop compression, I guess).

Well, AFAIK, NOPs do require resources for tracking in the machine. I
was hoping that hw would be smarter and discard at decode time but there
probably are reasons that it can't be done (...yet).

So they most likely get discarded at retire time, and I can't imagine
what an otherwise relatively idle core's ROB full of a gazillion NOPs
would look like. Those things need hw traces. Maybe in another life. :-)

$ taskset -c 3 ./t
Running 60 times, 1000000 loops per run.
nop_0x90 average: 0.390625
nop_3_byte average: 0.390625

and those exact numbers are actually reproducible pretty reliably.

$ taskset -c 3 ./t
Running 60 times, 1000000 loops per run.
nop_0x90 average: 0.390625
nop_3_byte average: 0.390625
$ taskset -c 3 ./t
Running 60 times, 1000000 loops per run.
nop_0x90 average: 0.390625
nop_3_byte average: 0.390625
$ taskset -c 3 ./t
Running 60 times, 1000000 loops per run.
nop_0x90 average: 0.390625
nop_3_byte average: 0.390625

Hmm, so what are we saying? Modern CPUs should use one set of NOPs and
that's it...

Maybe we need to do more measurements...

Hmmm.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-28 17:16:39

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Tue, Apr 28, 2015 at 9:58 AM, Borislav Petkov <[email protected]> wrote:
>
> Well, AFAIK, NOPs do require resources for tracking in the machine. I
> was hoping that hw would be smarter and discard at decode time but there
> probably are reasons that it can't be done (...yet).

I suspect it might be related to things like getting performance
counters and instruction debug traps etc right. There are quite
possibly also simply constraints where the front end has to generate
*something* just to keep the back end happy.

The front end can generally not just totally remove things without any
tracking, since the front end doesn't know if things are speculative
etc. So you can't do instruction debug traps in the front end afaik.
Or rather, I'm sure you *could*, but in general I suspect the best way
to handle nops without making them *too* special is to bunch up
several to make them look like one big instruction, and then associate
that bunch with some minimal tracking uop that uses minimal resources
in the back end without losing sight of the original nop entirely, so
that you can still do checks at retirement time.

So I think the "you can do ~5 nops per cycle" is not unreasonable.
Even in the uop cache, the nops have to take some space, and have to
do things like update eip, so I don't think they'll ever be entirely
free, the best you can do is minimize their impact.

> $ taskset -c 3 ./t
> Running 60 times, 1000000 loops per run.
> nop_0x90 average: 0.390625
> nop_3_byte average: 0.390625
>
> and those exact numbers are actually reproducible pretty reliably.

Yeah. That looks somewhat reasonable. I think the 16h architecture
technically decodes just two instructions per cycle, but I wouldn't be
surprised if there's some simple nop special casing going on so that
it can decode three nops in one go when things line up right. So you
might get 0.33 cycles for the best case, but then 0.5 cycles when it
crosses a 16-byte boundary or something. So you might have some
pattern where it decodes 32 bytes worth of nops as 12/8/12 bytes
(3/2/3 instructions), which would come out to 0.38 cycles. Add some
random overhead for the loop, and I could see the 0.39 cycles.

That was wild handwaving with no data to back it up, but I'm trying to
explain to myself why you could get some odd number like that. It
seems _possible_ at least.

Linus

2015-04-28 18:38:45

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Tue, Apr 28, 2015 at 10:16:33AM -0700, Linus Torvalds wrote:
> I suspect it might be related to things like getting performance
> counters and instruction debug traps etc right. There are quite
> possibly also simply constraints where the front end has to generate
> *something* just to keep the back end happy.
>
> The front end can generally not just totally remove things without any
> tracking, since the front end doesn't know if things are speculative
> etc. So you can't do instruction debug traps in the front end afaik.
> Or rather, I'm sure you *could*, but in general I suspect the best way
> to handle nops without making them *too* special is to bunch up
> several to make them look like one big instruction, and then associate
> that bunch with some minimal tracking uop that uses minimal resources
> in the back end without losing sight of the original nop entirely, so
> that you can still do checks at retirement time.

Yeah, I was thinking about a simplified uop for tracking - makes most
sense ...

> So I think the "you can do ~5 nops per cycle" is not unreasonable.
> Even in the uop cache, the nops have to take some space, and have to
> do things like update eip, so I don't think they'll ever be entirely
> free, the best you can do is minimize their impact.

... exactly! So something needs to increment rIP, which means you either
special-handle that and remember by how many bytes to increment and
exactly *when* at retire time, or you simply use a barebones, simplified
uop which does that for you for free and flows down the pipe. Yeah, that
makes a lot of sense!

> Yeah. That looks somewhat reasonable. I think the 16h architecture
> technically decodes just two instructions per cycle,

Yeah, fetch 32B and look at two 16B windows for max 2 insns per cycle.
I.e., two-way.

> but I wouldn't be surprised if there's some simple nop special casing
> going on so that it can decode three nops in one go when things line
> up right.

Right.

> So you might get 0.33 cycles for the best case, but then 0.5 cycles
> when it crosses a 16-byte boundary or something. So you might have
> some pattern where it decodes 32 bytes worth of nops as 12/8/12 bytes
> (3/2/3 instructions), which would come out to 0.38 cycles. Add some
> random overhead for the loop, and I could see the 0.39 cycles.
>
> That was wild handwaving with no data to back it up, but I'm trying
> to explain to myself why you could get some odd number like that. It
> seems _possible_ at least.

Yep, that makes sense.

Now if only we had some numbers to back this up with... I'll play with
this more.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-30 21:39:57

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

This is the microbenchmark I used.

For the record, Intel's intention going forward is that 0F 1F will
always be as fast or faster than any other alternative.

-hpa


Attachments:
nopltest.c (2.37 kB)

2015-04-30 23:24:10

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On 04/30/2015 02:39 PM, H. Peter Anvin wrote:
> This is the microbenchmark I used.
>
> For the record, Intel's intention going forward is that 0F 1F will
> always be as fast or faster than any other alternative.
>

I probably should have added that the microbenchmark specifically tests
for an atomic 5-byte NOP (as required by tracepoints etc.). If the
requirement for a 5-byte atomic NOP is dropped, there might be faster
combinations: e.g. 66 66 66 90 might work better on some platforms, as
might very long 0F 1F sequences.

-hpa

2015-05-01 09:04:05

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 30, 2015 at 04:23:26PM -0700, H. Peter Anvin wrote:
> I probably should have added that the microbenchmark specifically tests
> for an atomic 5-byte NOP (as required by tracepoints etc.) If the
> requirement for 5-byte atomic is dropped there might be faster
> combinations, e.g. 66 66 66 90 might work better on some platforms, or
> using very long 0F 1F sequences.

Right, so I was thinking of extending it to all sizes from 1 to 15 and
then running it on boxes. I'll keep you posted.

Thanks for the toy - now I get to have some fun. :-)

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-05-03 11:51:31

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue

On Thu, Apr 30, 2015 at 02:39:07PM -0700, H. Peter Anvin wrote:
> This is the microbenchmark I used.
>
> For the record, Intel's intention going forward is that 0F 1F will
> always be as fast or faster than any other alternative.

It looks like this is the case on AMD too.

So I took your benchmark and modified it to measure all sizes of K8 and
P6 NOPs. Also, I'm doing 10^6 iterations and taking the minimum. The
results speak for themselves, especially from 5-byte NOPs onwards, where
we have to repeat the K8 NOP but can still use a single P6 NOP.

And I'm going to move all relevant AMD hw to use the P6 NOPs for the
alternatives.

Unless I've done something wrong, of course. Please double-check, I'm
attaching the microbenchmark too.

Anyway, here's a patch:

---
From: Borislav Petkov <[email protected]>
Date: Sat, 2 May 2015 23:55:40 +0200
Subject: [PATCH] x86/alternatives: Switch AMD F15h and later to the P6 NOPs

Software optimization guides for both F15h and F16h cite those NOPs as
the optimal ones. A microbenchmark confirms that actually even older
families are better with the single-insn NOPs, so switch to them for the
alternatives.

The cycle counts below include the loop overhead of the measurement,
but that overhead is the same for all runs.

F10h, revE:
-----------
Running NOP tests, 1000 NOPs x 1000000 repetitions

K8:
90 288.212282 cycles
66 90 288.220840 cycles
66 66 90 288.219447 cycles
66 66 66 90 288.223204 cycles
66 66 90 66 90 571.393424 cycles
66 66 90 66 66 90 571.374919 cycles
66 66 66 90 66 66 90 572.249281 cycles
66 66 66 90 66 66 66 90 571.388651 cycles

P6:
90 288.214193 cycles
66 90 288.225550 cycles
0f 1f 00 288.224441 cycles
0f 1f 40 00 288.225030 cycles
0f 1f 44 00 00 288.233558 cycles
66 0f 1f 44 00 00 324.792342 cycles
0f 1f 80 00 00 00 00 325.657462 cycles
0f 1f 84 00 00 00 00 00 430.246643 cycles

F14h:
-----
Running NOP tests, 1000 NOPs x 1000000 repetitions

K8:
90 510.404890 cycles
66 90 510.432117 cycles
66 66 90 510.561858 cycles
66 66 66 90 510.541865 cycles
66 66 90 66 90 1014.192782 cycles
66 66 90 66 66 90 1014.226546 cycles
66 66 66 90 66 66 90 1014.334299 cycles
66 66 66 90 66 66 66 90 1014.381205 cycles

P6:
90 510.436710 cycles
66 90 510.448229 cycles
0f 1f 00 510.545100 cycles
0f 1f 40 00 510.502792 cycles
0f 1f 44 00 00 510.589517 cycles
66 0f 1f 44 00 00 510.611462 cycles
0f 1f 80 00 00 00 00 511.166794 cycles
0f 1f 84 00 00 00 00 00 511.651641 cycles

F15h:
-----
Running NOP tests, 1000 NOPs x 1000000 repetitions

K8:
90 243.128396 cycles
66 90 243.129883 cycles
66 66 90 243.131631 cycles
66 66 66 90 242.499324 cycles
66 66 90 66 90 481.829083 cycles
66 66 90 66 66 90 481.884413 cycles
66 66 66 90 66 66 90 481.851446 cycles
66 66 66 90 66 66 66 90 481.409220 cycles

P6:
90 243.127026 cycles
66 90 243.130711 cycles
0f 1f 00 243.122747 cycles
0f 1f 40 00 242.497617 cycles
0f 1f 44 00 00 245.354461 cycles
66 0f 1f 44 00 00 361.930417 cycles
0f 1f 80 00 00 00 00 362.844944 cycles
0f 1f 84 00 00 00 00 00 480.514948 cycles

F16h:
-----
Running NOP tests, 1000 NOPs x 1000000 repetitions

K8:
90 507.793298 cycles
66 90 507.789636 cycles
66 66 90 507.826490 cycles
66 66 66 90 507.859075 cycles
66 66 90 66 90 1008.663129 cycles
66 66 90 66 66 90 1008.696259 cycles
66 66 66 90 66 66 90 1008.692517 cycles
66 66 66 90 66 66 66 90 1008.755399 cycles

P6:
90 507.795232 cycles
66 90 507.794761 cycles
0f 1f 00 507.834901 cycles
0f 1f 40 00 507.822629 cycles
0f 1f 44 00 00 507.838493 cycles
66 0f 1f 44 00 00 507.908597 cycles
0f 1f 80 00 00 00 00 507.946417 cycles
0f 1f 84 00 00 00 00 00 507.954960 cycles

Signed-off-by: Borislav Petkov <[email protected]>
Cc: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/kernel/alternative.c | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aef653193160..b0932c4341b3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -227,6 +227,15 @@ void __init arch_init_ideal_nops(void)
#endif
}
break;
+
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 > 0xf) {
+ ideal_nops = p6_nops;
+ return;
+ }
+
+ /* fall through */
+
default:
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
--
2.3.5

Modified benchmark:

---
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdbool.h>
#include <sys/time.h>

typedef unsigned long long u64;

#define DECLARE_ARGS(val, low, high) unsigned low, high
#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)

static __always_inline unsigned long long rdtsc(void)
{
DECLARE_ARGS(val, low, high);

asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

return EAX_EDX_VAL(val, low, high);
}

static inline u64 read_tsc(void)
{
u64 ret;

asm volatile("mfence");
ret = rdtsc();
asm volatile("mfence");

return ret;
}

#define __stringify_1(x...) #x
#define __stringify(x...) __stringify_1(x)

#define GENERIC_NOP1 0x90

#define K8_NOP1 GENERIC_NOP1
#define K8_NOP2 0x66,K8_NOP1
#define K8_NOP3 0x66,K8_NOP2
#define K8_NOP4 0x66,K8_NOP3
#define K8_NOP5 K8_NOP3,K8_NOP2
#define K8_NOP6 K8_NOP3,K8_NOP3
#define K8_NOP7 K8_NOP4,K8_NOP3
#define K8_NOP8 K8_NOP4,K8_NOP4

#define P6_NOP3 0x0f,0x1f,0x00
#define P6_NOP4 0x0f,0x1f,0x40,0
#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0

#define BUILD_NOP(func, nop) \
static void func(void) \
{ \
asm volatile(".rept 1000\n" \
".byte " __stringify(nop) "\n" \
".endr"); \
}

/* single-byte NOP */
BUILD_NOP(k8_nop1, K8_NOP1)

/* 2-byte NOPs */
BUILD_NOP(k8_nop2, K8_NOP2)

/* 3-byte NOPs */
BUILD_NOP(k8_nop3, K8_NOP3)
BUILD_NOP(p6_nop3, P6_NOP3)

/* 4-byte NOPs */
BUILD_NOP(k8_nop4, K8_NOP4)
BUILD_NOP(p6_nop4, P6_NOP4)

/* 5-byte NOPs */
static void p6_nop5(void)
{
asm volatile(".rept 1000\n"
".byte 0x0f,0x1f,0x44,0x00,0x00\n"
".endr");
}

static void nop_k8(void)
{
asm volatile(".rept 1000\n"
".byte 0x66,0x66,0x66,0x66,0x90\n"
".endr");
}

BUILD_NOP(k8_nop5, K8_NOP5)

static void nop_lea(void)
{
#ifdef __x86_64__
asm volatile(".rept 1000\n"
".byte 0x48,0x8d,0x74,0x26,0x00\n"
".endr");
#else
asm volatile(".rept 1000\n"
".byte 0x3e,0x8d,0x74,0x26,0x00\n"
".endr");
#endif
}

static void nop_jmp5(void)
{
asm volatile(".rept 1000\n"
".byte 0xe9,0,0,0,0\n"
".endr");
}

static void nop_jmp2(void)
{
asm volatile(".rept 1000\n"
".byte 0xeb,3,0x90,0x90,0x90\n"
".endr");
}

static void nop_xchg(void)
{
asm volatile(".rept 1000\n"
".byte 0x66,0x66,0x66,0x87,0xc0\n"
".endr");
}

static void nop_mov(void)
{
asm volatile(".rept 1000\n"
".byte 0x66,0x66,0x66,0x89,0xc0\n"
".endr");
}

static void nop_fdisi(void)
{
asm volatile(".rept 1000\n"
".byte 0x66,0x66,0x66,0xdb,0xe1\n"
".endr");
}

static void nop_feni(void)
{
asm volatile(".rept 1000\n"
".byte 0x66,0x66,0x66,0xdb,0xe0\n"
".endr");
}

/* 6-byte NOPs */
BUILD_NOP(k8_nop6, K8_NOP6)
BUILD_NOP(p6_nop6, P6_NOP6)

/* 7-byte NOPs */
BUILD_NOP(k8_nop7, K8_NOP7)
BUILD_NOP(p6_nop7, P6_NOP7)

/* 8-byte NOPs */
BUILD_NOP(k8_nop8, K8_NOP8)
BUILD_NOP(p6_nop8, P6_NOP8)

struct test_list {
const char *name;
void (*func)(void);
};

static const struct test_list tests[] = {
{ "P6 NOPs (NOPL)", p6_nop5 },
{ "K8 NOPs (66 90)", nop_k8 },
{ "LEA", nop_lea },
{ "XCHG", nop_xchg },
{ "MOV", nop_mov },
{ "FDISI", nop_fdisi },
{ "FENI", nop_feni },
{ "E9 JMP", nop_jmp5 },
{ "EB JMP", nop_jmp2 },
{ NULL, NULL }
};

#define TIMES 30
static void benchmark(const struct test_list *test, const int reps, bool warmup)
{
u64 p1, p2, r;
double min = 10000000000;
int i, j;

for (j = 0; j < TIMES; j++) {
p1 = read_tsc();
for (i = 0; i < reps; i++)
test->func();
p2 = read_tsc();

r = (p2 - p1);

if (r < min)
min = r;
}

if (!warmup)
printf("%24s%15f cycles\n", test->name, min/reps);
}

static const struct test_list k8_nops[] = {
{ NULL, NULL },
{ "90", k8_nop1 },
{ "66 90", k8_nop2 },
{ "66 66 90", k8_nop3 },
{ "66 66 66 90", k8_nop4 },
{ "66 66 90 66 90", k8_nop5 },
{ "66 66 90 66 66 90", k8_nop6 },
{ "66 66 66 90 66 66 90", k8_nop7 },
{ "66 66 66 90 66 66 66 90", k8_nop8 },
{ NULL, NULL },
};

static const struct test_list f16h_nops[] = {
{ NULL, NULL },
{ "90", k8_nop1 },
{ "66 90", k8_nop2 },
{ "0f 1f 00", p6_nop3 },
{ "0f 1f 40 00", p6_nop4 },
{ "0f 1f 44 00 00", p6_nop5 },
{ "66 0f 1f 44 00 00", p6_nop6 },
{ "0f 1f 80 00 00 00 00", p6_nop7 },
{ "0f 1f 84 00 00 00 00 00", p6_nop8 },
{ NULL, NULL },
};

int main(void)
{
const int reps = 1000000;
const struct test_list *test;
int i;

printf("Running NOP tests, 1000 NOPs x %d repetitions\n\n", reps);

#if 0
for (test = tests; test->func; test++) {
benchmark(test, reps, true);
benchmark(test, reps, false);
}
#endif

printf("K8:\n");
for (i = 1; i < 9; i++) {
benchmark(&k8_nops[i], reps, true);
benchmark(&k8_nops[i], reps, false);
}
printf("\n");

printf("P6:\n");
for (i = 1; i < 9; i++) {
benchmark(&f16h_nops[i], reps, true);
benchmark(&f16h_nops[i], reps, false);
}
printf("\n");

return 0;
}

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--