2018-01-15 03:40:48

by Nadav Amit

[permalink] [raw]
Subject: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Currently, when page-table isolation is on to prevent the Meltdown bug
(CVE-2017-5754), CR3 is always loaded on system-call and interrupt.

However, it appears that this is an unnecessary measure when programs
run in compatibility mode. In this mode only 32-bit registers are
available, which means that there *should* be no way for the CPU to
access, even speculatively, memory that belongs to the kernel, which
sits in high addresses.

While this may seem as an "uninteresting" case, it opens the possibility
to run I/O intensive applications in compatibility mode with improved
performance relatively to their 64-bit counterpart. These applications
are arguably also the ones that are least affected by the less efficient
32-code.

For example, on Dell R630 (Intel E5-2670 v3)

Redis (redis-benchmark)
======================
x86_64 i386 improvement
-------------------------------------------
PING_INLINE: 103519.66 122249.39 18.09%
PING_BULK: 104493.2 120918.98 15.72%
SET: 86580.09 126103.41 45.65%
GET: 87719.3 124533 41.97%
INCR: 88573.96 123915.73 39.90%
LPUSH: 88731.15 99502.48 12.14%
RPUSH: 88417.33 99108.03 12.09%
LPOP: 88261.25 99304.87 12.51%
RPOP: 88183.43 98716.68 11.94%
SADD: 88028.16 98911.97 12.36%
SPOP: 88261.25 97465.89 10.43%
LPUSH 87873.46 99108.03 12.78%
LRANGE_100 51572.98 47326.08 -8.23%
LRANGE_300 20383.2 16528.93 -18.91%
LRANGE_500 13259.08 10617.97 -19.92%
LRANGE_600 10246.95 8389.26 -18.13%
MSET 89847.26 93370.68 3.92%

Apache (wrk -c 8 -t 4 --latency -d 30s http://localhost)
========================================================
Latency: 137.11us 312.18us -56%
Req/Sec: 17.19k 17.02k -1%

This patch provides a rough implementation. It is mainly intended to
provide the idea and gather feedback whether compatibility mode can be
considered safe. It does overlap with Willy Tarreau work and indeed the
NX-bit should be resolved more nicely.

Signed-off-by: Nadav Amit <[email protected]>
---
arch/x86/entry/calling.h | 29 +++++++++++++++++++++++++++++
arch/x86/entry/entry_64_compat.S | 4 ++--
arch/x86/include/asm/desc.h | 17 +++++++++++++++++
arch/x86/include/asm/tlbflush.h | 9 ++++++++-
arch/x86/kernel/process_64.c | 11 +++++++++++
arch/x86/mm/pti.c | 17 -----------------
6 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e00a6af..9b0190f4a96b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -213,7 +213,14 @@ For 32-bit we have the following conventions - kernel is built with

.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * Do not switch on compatibility mode.
+ */
mov %cr3, \scratch_reg
+ testq $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
@@ -224,6 +231,16 @@ For 32-bit we have the following conventions - kernel is built with

.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ /*
+ * Do not switch on compatibility mode. If there is no need for a
+ * flush, run lfence to avoid speculative execution returning to user
+ * with the wrong CR3.
+ */
+ movq PER_CPU_VAR(current_task), \scratch_reg
+ testl $_TIF_IA32, TASK_TI_flags(\scratch_reg)
+ jnz .Lno_spec_\@
+
mov %cr3, \scratch_reg

ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -241,6 +258,10 @@ For 32-bit we have the following conventions - kernel is built with
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@

+.Lno_spec_\@:
+ lfence
+ jmp .Lend_\@
+
.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
@@ -286,6 +307,10 @@ For 32-bit we have the following conventions - kernel is built with

ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID

+ movq PER_CPU_VAR(current_task), \scratch_reg
+ testl $_TIF_IA32, TASK_TI_flags(\scratch_reg)
+ jnz .Lno_spec_\@
+
/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
@@ -305,6 +330,10 @@ For 32-bit we have the following conventions - kernel is built with
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@

+.Lno_spec_\@:
+ lfence
+ jmp .Lend_\@
+
.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 98d5358e4041..0d520c80ef36 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -4,7 +4,7 @@
*
* Copyright 2000-2002 Andi Kleen, SuSE Labs.
*/
-#include "calling.h"
+#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
@@ -14,8 +14,8 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <linux/linkage.h>
#include <linux/err.h>
+#include "calling.h"

.section .entry.text, "ax"

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 13c5ee878a47..5d139f915b2e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -431,6 +431,23 @@ static inline void load_current_idt(void)
load_idt((const struct desc_ptr *)&idt_descr);
}

+static inline void remove_user_cs64(void)
+{
+ struct desc_struct *d = get_cpu_gdt_rw(smp_processor_id());
+ struct desc_struct user_cs = {0};
+
+ write_gdt_entry(d, GDT_ENTRY_DEFAULT_USER_CS, &user_cs, DESCTYPE_S);
+}
+
+static inline void restore_user_cs64(void)
+{
+ struct desc_struct *d = get_cpu_gdt_rw(smp_processor_id());
+ struct desc_struct user_cs = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff);
+
+ write_gdt_entry(d, GDT_ENTRY_DEFAULT_USER_CS, &user_cs, DESCTYPE_S);
+}
+
+
extern void idt_setup_early_handler(void);
extern void idt_setup_early_traps(void);
extern void idt_setup_traps(void);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4a08dd2ab32a..d11480130ef1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -355,7 +355,8 @@ static inline void __native_flush_tlb(void)
*/
WARN_ON_ONCE(preemptible());

- invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ if (!(current->mm && test_thread_flag(TIF_IA32)))
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));

/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
@@ -407,6 +408,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
if (!static_cpu_has(X86_FEATURE_PTI))
return;

+ if (current->mm && test_thread_flag(TIF_IA32))
+ return;
+
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
@@ -443,6 +447,9 @@ static inline void __flush_tlb_one(unsigned long addr)
if (!static_cpu_has(X86_FEATURE_PTI))
return;

+ if (current->mm && test_thread_flag(TIF_IA32))
+ return;
+
/*
* __flush_tlb_single() will have cleared the TLB entry for this ASID,
* but since kernel space is replicated across all, we must also
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c75466232016..01235402cbbe 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -473,6 +473,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);

+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_IA32_EMULATION)
+ if (unlikely(static_cpu_has(X86_FEATURE_PTI) &&
+ (task_thread_info(next_p)->flags ^
+ task_thread_info(prev_p)->flags) & _TIF_IA32)) {
+ if (task_thread_info(next_p)->flags & _TIF_IA32)
+ remove_user_cs64();
+ else
+ restore_user_cs64();
+ }
+#endif
+
#ifdef CONFIG_XEN_PV
/*
* On Xen PV, IOPL bits in pt_regs->flags have no effect, and
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 43d4a4a29037..c7d7f5a35d8c 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -122,23 +122,6 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
*/
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

- /*
- * If this is normal user memory, make it NX in the kernel
- * pagetables so that, if we somehow screw up and return to
- * usermode with the kernel CR3 loaded, we'll get a page fault
- * instead of allowing user code to execute with the wrong CR3.
- *
- * As exceptions, we don't set NX if:
- * - _PAGE_USER is not set. This could be an executable
- * EFI runtime mapping or something similar, and the kernel
- * may execute from it
- * - we don't have NX support
- * - we're clearing the PGD (i.e. the new pgd is not present).
- */
- if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
- (__supported_pte_mask & _PAGE_NX))
- pgd.pgd |= _PAGE_NX;
-
/* return the copy of the PGD we want the kernel to use: */
return pgd;
}
--
2.14.1


2018-01-15 17:20:38

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI


> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>
> Currently, when page-table isolation is on to prevent the Meltdown bug
> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>
> However, it appears that this is an unnecessary measure when programs
> run in compatibility mode. In this mode only 32-bit registers are
> available, which means that there *should* be no way for the CPU to
> access, even speculatively, memory that belongs to the kernel, which
> sits in high addresses.

You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.

I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.

Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.

2018-01-15 17:42:11

by Nadav Amit

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Andy Lutomirski <[email protected]> wrote:

>
>> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>>
>> Currently, when page-table isolation is on to prevent the Meltdown bug
>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>>
>> However, it appears that this is an unnecessary measure when programs
>> run in compatibility mode. In this mode only 32-bit registers are
>> available, which means that there *should* be no way for the CPU to
>> access, even speculatively, memory that belongs to the kernel, which
>> sits in high addresses.
>
> You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.
>
> I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.

I noticed it doesn’t. I thought the removing/restoring the __USER_CS
descriptor on context switch, based on TIF_IA32, would be enough.
modify_ldt() always keeps the descriptor l-bit clear. I will review the
other GDT descriptors, and if needed, create two GDTs. Let me know if I
missed anything else.

> Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.

If SMEP is not supported, compatibility mode would still require page-table
isolation.

Thanks for the feedback. I still look for an ack for the basic idea of
disabling page-table isolation on compatibility mode.

Regards,
Nadav

2018-01-15 17:45:11

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI



> On Jan 15, 2018, at 9:42 AM, Nadav Amit <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> wrote:
>
>>
>>> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>>>
>>> Currently, when page-table isolation is on to prevent the Meltdown bug
>>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>>>
>>> However, it appears that this is an unnecessary measure when programs
>>> run in compatibility mode. In this mode only 32-bit registers are
>>> available, which means that there *should* be no way for the CPU to
>>> access, even speculatively, memory that belongs to the kernel, which
>>> sits in high addresses.
>>
>> You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.
>>
>> I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.
>
> I noticed it doesn’t. I thought the removing/restoring the __USER_CS
> descriptor on context switch, based on TIF_IA32, would be enough.
> modify_ldt() always keeps the descriptor l-bit clear. I will review the
> other GDT descriptors, and if needed, create two GDTs. Let me know if I
> missed anything else.

There world need to be some opt-in control, I think, for CRIU if nothing else.

Also, on Xen PV, it's a complete nonstarter. We don't have enough control over the GDT unless someone knows otherwise. But there's no PTI on Xen PV either.

>
>> Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.
>
> If SMEP is not supported, compatibility mode would still require page-table
> isolation.
>
> Thanks for the feedback. I still look for an ack for the basic idea of
> disabling page-table isolation on compatibility mode.
>

I'm still not really convinced this is worth it. It will send a bad message and get people to run critical stuff compiled for 32-bit, which has its own downsides.

> Regards,
> Nadav

2018-01-15 17:50:38

by Nadav Amit

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Andy Lutomirski <[email protected]> wrote:

>
>
>> On Jan 15, 2018, at 9:42 AM, Nadav Amit <[email protected]> wrote:
>>
>> Andy Lutomirski <[email protected]> wrote:
>>
>>>> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>>>>
>>>> Currently, when page-table isolation is on to prevent the Meltdown bug
>>>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>>>>
>>>> However, it appears that this is an unnecessary measure when programs
>>>> run in compatibility mode. In this mode only 32-bit registers are
>>>> available, which means that there *should* be no way for the CPU to
>>>> access, even speculatively, memory that belongs to the kernel, which
>>>> sits in high addresses.
>>>
>>> You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.
>>>
>>> I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.
>>
>> I noticed it doesn’t. I thought the removing/restoring the __USER_CS
>> descriptor on context switch, based on TIF_IA32, would be enough.
>> modify_ldt() always keeps the descriptor l-bit clear. I will review the
>> other GDT descriptors, and if needed, create two GDTs. Let me know if I
>> missed anything else.
>
> There world need to be some opt-in control, I think, for CRIU if nothing else.
>
> Also, on Xen PV, it's a complete nonstarter. We don't have enough control over the GDT unless someone knows otherwise. But there's no PTI on Xen PV either.
>
>>> Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.
>>
>> If SMEP is not supported, compatibility mode would still require page-table
>> isolation.
>>
>> Thanks for the feedback. I still look for an ack for the basic idea of
>> disabling page-table isolation on compatibility mode.
>
> I'm still not really convinced this is worth it. It will send a bad message and get people to run critical stuff compiled for 32-bit, which has its own downsides.

I can handle #GP gracefully if __USER_CS is loaded so PTI would be required
again. Doing so would eliminate the need for an opt-in, and preserve the
current semantics.

2018-01-15 18:04:55

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI



> On Jan 15, 2018, at 9:50 AM, Nadav Amit <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> wrote:
>
>>
>>
>>> On Jan 15, 2018, at 9:42 AM, Nadav Amit <[email protected]> wrote:
>>>
>>> Andy Lutomirski <[email protected]> wrote:
>>>
>>>>> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>>>>>
>>>>> Currently, when page-table isolation is on to prevent the Meltdown bug
>>>>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>>>>>
>>>>> However, it appears that this is an unnecessary measure when programs
>>>>> run in compatibility mode. In this mode only 32-bit registers are
>>>>> available, which means that there *should* be no way for the CPU to
>>>>> access, even speculatively, memory that belongs to the kernel, which
>>>>> sits in high addresses.
>>>>
>>>> You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.
>>>>
>>>> I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.
>>>
>>> I noticed it doesn’t. I thought the removing/restoring the __USER_CS
>>> descriptor on context switch, based on TIF_IA32, would be enough.
>>> modify_ldt() always keeps the descriptor l-bit clear. I will review the
>>> other GDT descriptors, and if needed, create two GDTs. Let me know if I
>>> missed anything else.
>>
>> There world need to be some opt-in control, I think, for CRIU if nothing else.
>>
>> Also, on Xen PV, it's a complete nonstarter. We don't have enough control over the GDT unless someone knows otherwise. But there's no PTI on Xen PV either.
>>
>>>> Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.
>>>
>>> If SMEP is not supported, compatibility mode would still require page-table
>>> isolation.
>>>
>>> Thanks for the feedback. I still look for an ack for the basic idea of
>>> disabling page-table isolation on compatibility mode.
>>
>> I'm still not really convinced this is worth it. It will send a bad message and get people to run critical stuff compiled for 32-bit, which has its own downsides.
>
> I can handle #GP gracefully if __USER_CS is loaded so PTI would be required
> again. Doing so would eliminate the need for an opt-in, and preserve the
> current semantics.
>

Not if someone used LAR, a la the sigreturn_32 test. Not necessarily a showstopper, though.

You'd also have to figure out how to do PTI per-thread, which Linus doesn't like. See Willy's PTI opt-out thread.

2018-01-15 18:50:33

by Nadav Amit

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Andy Lutomirski <[email protected]> wrote:

>
>
>> On Jan 15, 2018, at 9:50 AM, Nadav Amit <[email protected]> wrote:
>>
>> Andy Lutomirski <[email protected]> wrote:
>>
>>>> On Jan 15, 2018, at 9:42 AM, Nadav Amit <[email protected]> wrote:
>>>>
>>>> Andy Lutomirski <[email protected]> wrote:
>>>>
>>>>>> On Jan 14, 2018, at 12:13 PM, Nadav Amit <[email protected]> wrote:
>>>>>>
>>>>>> Currently, when page-table isolation is on to prevent the Meltdown bug
>>>>>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>>>>>>
>>>>>> However, it appears that this is an unnecessary measure when programs
>>>>>> run in compatibility mode. In this mode only 32-bit registers are
>>>>>> available, which means that there *should* be no way for the CPU to
>>>>>> access, even speculatively, memory that belongs to the kernel, which
>>>>>> sits in high addresses.
>>>>>
>>>>> You're assuming that TIF_IA32 prevents the execution of 64-bit code. It doesn't.
>>>>>
>>>>> I've occasionally considered adding an opt-in hardening mechanism to enforce 32-bit or 64-bit execution, but we don't have this now.
>>>>
>>>> I noticed it doesn’t. I thought the removing/restoring the __USER_CS
>>>> descriptor on context switch, based on TIF_IA32, would be enough.
>>>> modify_ldt() always keeps the descriptor l-bit clear. I will review the
>>>> other GDT descriptors, and if needed, create two GDTs. Let me know if I
>>>> missed anything else.
>>>
>>> There world need to be some opt-in control, I think, for CRIU if nothing else.
>>>
>>> Also, on Xen PV, it's a complete nonstarter. We don't have enough control over the GDT unless someone knows otherwise. But there's no PTI on Xen PV either.
>>>
>>>>> Anything like this would also need to spend on SMEP, I think -- the pseudo-SMEP granted by PTI is too valuable to give up on old boxes, I think.
>>>>
>>>> If SMEP is not supported, compatibility mode would still require page-table
>>>> isolation.
>>>>
>>>> Thanks for the feedback. I still look for an ack for the basic idea of
>>>> disabling page-table isolation on compatibility mode.
>>>
>>> I'm still not really convinced this is worth it. It will send a bad message and get people to run critical stuff compiled for 32-bit, which has its own downsides.
>>
>> I can handle #GP gracefully if __USER_CS is loaded so PTI would be required
>> again. Doing so would eliminate the need for an opt-in, and preserve the
>> current semantics.
>
> Not if someone used LAR, a la the sigreturn_32 test. Not necessarily a showstopper, though.

Thanks for pointing it out. Actually, I think that since
GDT_ENTRY_DEFAULT_USER_DS and GDT_ENTRY_DEFAULT_USER_CS are the last set
entries in the GDT, I can just play with the GDT limit (lower it on IA32),
and get LAR working as well.

> You'd also have to figure out how to do PTI per-thread, which Linus doesn't like. See Willy's PTI opt-out thread.

Maybe I read it wrong, but I think Linus's main objections are for
dynamically enabling/disabling PTI and for not having clear protection
guarantees. I don’t think that disabling PTI on compatibility mode suffers
from these limitations. (But then again…)

2018-01-15 19:49:23

by Dave Hansen

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

On 01/14/2018 12:13 PM, Nadav Amit wrote:
> Currently, when page-table isolation is on to prevent the Meltdown bug
> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.

I think of PTI as being a defense against bad stuff that happens from
the kernel being mapped into the user address space, with Meltdown being
the most obvious "bad" thing.

What you're saying here is that since a 32-bit program can't address the
kernel sitting at a >32-bit address, it does not need to unmap the
kernel. As Andy pointed out, there are a few holes with that assumption.

IMNHO, any PTI-disabling mechanisms better be rock-solid, and easy to
convince ourselves that they do the right thing. For instance, the
per-process PTI stuff is going to make the decision quite close to a
capability check, which makes it fairly easy to get right.

If we start disabling PTI willy nilly at points _away_ from the
capability checks (like for 32-bit binaries, say), then it gets really
hard to decide if we are doing the right things.

Also, what's the end goal here? Run old 32-bit binaries better? You
want to weaken the security of the whole implementation to do that?
Sounds like a bad tradeoff to me.

2018-01-15 19:53:11

by Willy Tarreau

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

On Mon, Jan 15, 2018 at 11:49:19AM -0800, Dave Hansen wrote:
> If we start disabling PTI willy nilly at points _away_ from the
> capability checks (like for 32-bit binaries, say), then it gets really
> hard to decide if we are doing the right things.
>
> Also, what's the end goal here? Run old 32-bit binaries better? You
> want to weaken the security of the whole implementation to do that?
> Sounds like a bad tradeoff to me.

In fact I understand it differently, which is that by running 32-bit,
he can recover the original performance without sacrifying security.
It's not that bad actually when you think about it since the vast
majority of performance-sensitive software doesn't need to access
even one GB of data.

Willy

2018-01-15 20:09:46

by Nadav Amit

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Dave Hansen <[email protected]> wrote:

> On 01/14/2018 12:13 PM, Nadav Amit wrote:
>> Currently, when page-table isolation is on to prevent the Meltdown bug
>> (CVE-2017-5754), CR3 is always loaded on system-call and interrupt.
>
> I think of PTI as being a defense against bad stuff that happens from
> the kernel being mapped into the user address space, with Meltdown being
> the most obvious "bad" thing.
>
> What you're saying here is that since a 32-bit program can't address the
> kernel sitting at a >32-bit address, it does not need to unmap the
> kernel. As Andy pointed out, there are a few holes with that assumption.

I think that Andy pointed out that my RFC may break existing programs, but
it should be relatively easy to fix. I don’t think there is any problem with
the assumption.

> IMNHO, any PTI-disabling mechanisms better be rock-solid, and easy to
> convince ourselves that they do the right thing. For instance, the
> per-process PTI stuff is going to make the decision quite close to a
> capability check, which makes it fairly easy to get right.

Per-process PTI stuff is an easy solution, but it just pushes the decision
to the user, who is likely to disable PTI without thinking twice, especially
since he got no other option.

> If we start disabling PTI willy nilly at points _away_ from the
> capability checks (like for 32-bit binaries, say), then it gets really
> hard to decide if we are doing the right things.

Eventually it comes down to the question: what does the CPU do? I was
assuming that Intel can figure it out. If it is just about being “paranoid”,
I presume some paranoid knob should control this behavior.

> Also, what's the end goal here? Run old 32-bit binaries better? You
> want to weaken the security of the whole implementation to do that?
> Sounds like a bad tradeoff to me.

As Willy noted in this thread, I think that some users may be interested in
running 32-bit Apache/Nginx/Redis to get the performance back without
sacrificing security.

2018-01-16 00:41:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI


* Nadav Amit <[email protected]> wrote:

> > Also, what's the end goal here? Run old 32-bit binaries better? You
> > want to weaken the security of the whole implementation to do that?
> > Sounds like a bad tradeoff to me.
>
> As Willy noted in this thread, I think that some users may be interested in
> running 32-bit Apache/Nginx/Redis to get the performance back without
> sacrificing security.

Note that it is a flawed assumption to think that this is possible, as they might
in many cases not be getting their performance back: 32-bit binaries for the same
general CPU bound computation can easily be 5% slower than 64-bit binaries (as
long as the larger cache footprint of 64-bit data doesn't fall out of key caches),
but can be up to 30% slower for certain computations.

In fact, depending on how kernel heavy the web workload is (for example how much
CGI processing versus IO it does, etc.), a 32-bit binary could be distinctly
_slower_ than even a PTI-enabled 64-bit binary.

So we are trading a 5-15% slowdown (PTI) for another 5-15% slowdown, plus we are
losing the soft-SMEP feature on older CPUs that PTI enables, which is a pretty
powerful mitigation technique.

Yes, I suspect in some (maybe many) cases it would be a speedup, but I really
don't like the underlying assumptions and tradeoffs here. (Not that I like any of
this whole Meltdown debacle TBH.)

Thanks,

Ingo

2018-01-16 03:49:22

by Nadav Amit

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

Ingo Molnar <[email protected]> wrote:

>
> * Nadav Amit <[email protected]> wrote:
>
>>> Also, what's the end goal here? Run old 32-bit binaries better? You
>>> want to weaken the security of the whole implementation to do that?
>>> Sounds like a bad tradeoff to me.
>>
>> As Willy noted in this thread, I think that some users may be interested in
>> running 32-bit Apache/Nginx/Redis to get the performance back without
>> sacrificing security.
>
> Note that it is a flawed assumption to think that this is possible, as they might
> in many cases not be getting their performance back: 32-bit binaries for the same
> general CPU bound computation can easily be 5% slower than 64-bit binaries (as
> long as the larger cache footprint of 64-bit data doesn't fall out of key caches),
> but can be up to 30% slower for certain computations.
>
> In fact, depending on how kernel heavy the web workload is (for example how much
> CGI processing versus IO it does, etc.), a 32-bit binary could be distinctly
> _slower_ than even a PTI-enabled 64-bit binary.

Obviously you are right - I didn’t argue otherwise - and I think it is also
reflected in the results (Redis LRANGE results). Yet, arguably the workloads
that are affected the most by PTI are those with a high number of syscalls
and interrupts, in which user computation time is relatively small.

> So we are trading a 5-15% slowdown (PTI) for another 5-15% slowdown, plus we are
> losing the soft-SMEP feature on older CPUs that PTI enables, which is a pretty
> powerful mitigation technique.

This soft-SMEP can be kept by keeping PTI if SMEP is unsupported. Although
we trade slowdowns, they are different ones, which allows the user to make
his best decision.

> Yes, I suspect in some (maybe many) cases it would be a speedup, but I really
> don't like the underlying assumptions and tradeoffs here. (Not that I like any of
> this whole Meltdown debacle TBH.)

To make sure that I understand correctly - the assumptions are that
disabling PTI on compatibility mode would: (1) Benefit some workloads; (2)
Be useful, even if we only consider CPUs with SMEP; and (3) Secure.

Under these assumptions, the tradeoff is slightly greater code complexity
for considerably better performance of 32-bit code; in some common cases
this makes 32-bit code to perform significantly better than 64-bit code.

Am I missing something? My main concern was initially security, but so far
from your aggregated feedback I did not see something concrete which cannot
relatively easily be addressed.

2018-01-20 14:28:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI


* Nadav Amit <[email protected]> wrote:

> > So we are trading a 5-15% slowdown (PTI) for another 5-15% slowdown, plus we
> > are losing the soft-SMEP feature on older CPUs that PTI enables, which is a
> > pretty powerful mitigation technique.
>
> This soft-SMEP can be kept by keeping PTI if SMEP is unsupported. Although we
> trade slowdowns, they are different ones, which allows the user to make his best
> decision.

Indeed, not allowing PTI to be disabled if SMEP is unavailable might be a
solution.

> > Yes, I suspect in some (maybe many) cases it would be a speedup, but I really
> > don't like the underlying assumptions and tradeoffs here. (Not that I like any
> > of this whole Meltdown debacle TBH.)
>
> To make sure that I understand correctly - the assumptions are that disabling
> PTI on compatibility mode would: (1) Benefit some workloads; (2) Be useful, even
> if we only consider CPUs with SMEP; and (3) Secure.
>
> Under these assumptions, the tradeoff is slightly greater code complexity for
> considerably better performance of 32-bit code; in some common cases this makes
> 32-bit code to perform significantly better than 64-bit code.
>
> Am I missing something? My main concern was initially security, but so far from
> your aggregated feedback I did not see something concrete which cannot
> relatively easily be addressed.

Yes, I suppose.

Thanks,

Ingo

2018-01-20 16:38:08

by Willy Tarreau

[permalink] [raw]
Subject: Re: [RFC] x86: Avoid CR3 load on compatibility mode with PTI

On Sat, Jan 20, 2018 at 03:26:27PM +0100, Ingo Molnar wrote:
>
> * Nadav Amit <[email protected]> wrote:
>
> > > So we are trading a 5-15% slowdown (PTI) for another 5-15% slowdown, plus we
> > > are losing the soft-SMEP feature on older CPUs that PTI enables, which is a
> > > pretty powerful mitigation technique.
> >
> > This soft-SMEP can be kept by keeping PTI if SMEP is unsupported. Although we
> > trade slowdowns, they are different ones, which allows the user to make his best
> > decision.
>
> Indeed, not allowing PTI to be disabled if SMEP is unavailable might be a
> solution.

Well, I do not agree with this, for the simple reason that the SMEP-like
protection provided by PTI was in fact a byproduct of the Meltdown
mitigation, eventhough quite a valuable one. For me, disabling PTI means
"I want to recover the performance I had on this workload before the PTI
fixes because I value performance over security". By doing it per process
we'll allow users to have both performance for a few processes and
protection (including SMEP-like) for the rest of the system. Their only
other choice will be to completely disable PTI, thus removing all
protection and losing the SMEP emulation.

Best regards,
Willy