From: Peter Zijlstra <[email protected]>
When mapping the LDT RO the hardware will typically generate write faults
on first use. These faults can be trapped and the backing pages can be
modified by the kernel.
There is one exception; IRET will immediately load CS/SS and unrecoverably
#GP. To avoid this issue access the LDT descriptors used by CS/SS before
the IRET to userspace.
For this use LAR, which is a safe operation in that it will happily consume
an invalid LDT descriptor without traps. It gets the CPU to load the
descriptor and observes the (preset) ACCESS bit.
So far none of the obvious candidates like dosemu/wine/etc. do care about
the ACCESS bit at all, so it should be rather safe to enforce it.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/entry/common.c | 8 ++++-
arch/x86/include/asm/desc.h | 2 +
arch/x86/include/asm/mmu_context.h | 53 +++++++++++++++++++++++--------------
arch/x86/include/asm/thread_info.h | 4 ++
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/ldt.c | 30 ++++++++++++++++++++
arch/x86/mm/tlb.c | 2 -
arch/x86/power/cpu.c | 2 -
8 files changed, 78 insertions(+), 27 deletions(-)
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -30,6 +30,7 @@
#include <asm/vdso.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/mmu_context.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -130,8 +131,8 @@ static long syscall_trace_enter(struct p
return ret ?: regs->orig_ax;
}
-#define EXIT_TO_USERMODE_LOOP_FLAGS \
- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+#define EXIT_TO_USERMODE_LOOP_FLAGS \
+ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | _TIF_LDT |\
_TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
@@ -171,6 +172,9 @@ static void exit_to_usermode_loop(struct
/* Disable IRQs and retry */
local_irq_disable();
+ if (cached_flags & _TIF_LDT)
+ ldt_exit_user(regs);
+
cached_flags = READ_ONCE(current_thread_info()->flags);
if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -20,6 +20,8 @@ static inline void fill_ldt(struct desc_
desc->type = (info->read_exec_only ^ 1) << 1;
desc->type |= info->contents << 2;
+ /* Set ACCESS bit */
+ desc->type |= 1;
desc->s = 1;
desc->dpl = 0x3;
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -57,24 +57,34 @@ struct ldt_struct {
/*
* Used for LDT copy/destruction.
*/
-static inline void init_new_context_ldt(struct mm_struct *mm)
+static inline void init_new_context_ldt(struct task_struct *task,
+ struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
+ /*
+ * Set the TIF flag unconditonally as in ldt_dup_context() the new
+ * task pointer is not available. In case there is no LDT this is a
+ * nop on the first exit to user space.
+ */
+ set_tsk_thread_flag(task, TIF_LDT);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
+void ldt_exit_user(struct pt_regs *regs);
void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline void init_new_context_ldt(struct task_struct *task,
+ struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{
return 0;
}
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void ldt_exit_user(struct pt_regs *regs) { }
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
#endif
-static inline void load_mm_ldt(struct mm_struct *mm)
+static inline void load_mm_ldt(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
@@ -83,28 +93,31 @@ static inline void load_mm_ldt(struct mm
ldt = READ_ONCE(mm->context.ldt);
/*
- * Any change to mm->context.ldt is followed by an IPI to all
- * CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
- * if the ldt_struct changes before we return, the values we see
- * will be safe, and the new values will be loaded before we run
- * any user code.
+ * Clear LDT if the mm does not have it set or if this is a kernel
+ * thread which might temporarily use the mm of a user process via
+ * use_mm(). If the next task uses LDT then set it up and set
+ * TIF_LDT so it will touch the new LDT on exit to user space.
*
- * NB: don't try to convert this to use RCU without extreme care.
- * We would still need IRQs off, because we don't want to change
- * the local LDT after an IPI loaded a newer value than the one
- * that we can see.
+ * This code is run with interrupts disabled so it is serialized
+ * against the IPI from ldt_install_mm().
*/
- if (unlikely(ldt && !(current->flags & PF_KTHREAD))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (likely(!ldt || (tsk->flags & PF_KTHREAD))) {
clear_LDT();
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ set_tsk_thread_flag(tsk, TIF_LDT);
+ }
#else
+ /*
+ * FIXME: This wants a comment why this actually does anything at
+ * all when the syscall is disabled.
+ */
clear_LDT();
#endif
}
-static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
@@ -126,7 +139,7 @@ static inline void switch_ldt(struct mm_
*/
if (unlikely((unsigned long)prev->context.ldt |
(unsigned long)next->context.ldt))
- load_mm_ldt(next);
+ load_mm_ldt(next, tsk);
#endif
DEBUG_LOCKS_WARN_ON(preemptible());
@@ -150,7 +163,7 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
- init_new_context_ldt(mm);
+ init_new_context_ldt(tsk, mm);
return 0;
}
static inline void destroy_context(struct mm_struct *mm)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_LDT 9 /* Populate LDT after fork */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -109,6 +110,7 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_LDT (1 << TIF_LDT)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
@@ -141,7 +143,7 @@ struct thread_info {
_TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \
_TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \
_TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \
- _TIF_FSCHECK)
+ _TIF_FSCHECK | _TIF_LDT)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1602,7 +1602,7 @@ void cpu_init(void)
set_tss_desc(cpu, t);
load_TR_desc();
- load_mm_ldt(&init_mm);
+ load_mm_ldt(&init_mm, current);
clear_all_debug_regs();
dbg_restore_debug_regs();
@@ -1660,7 +1660,7 @@ void cpu_init(void)
set_tss_desc(cpu, t);
load_TR_desc();
- load_mm_ldt(&init_mm);
+ load_mm_ldt(&init_mm, current);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -164,6 +164,36 @@ int ldt_dup_context(struct mm_struct *ol
}
/*
+ * Touching the LDT entries with LAR makes sure that the CPU "caches" the
+ * ACCESSED bit in the LDT entry which is already set when the entry is
+ * stored.
+ */
+static inline void ldt_touch_seg(unsigned long seg)
+{
+ u16 ar, sel = (u16)seg & ~SEGMENT_RPL_MASK;
+
+ if (!(seg & SEGMENT_LDT))
+ return;
+
+ asm volatile ("lar %[sel], %[ar]"
+ : [ar] "=R" (ar)
+ : [sel] "R" (sel));
+}
+
+void ldt_exit_user(struct pt_regs *regs)
+{
+ struct mm_struct *mm = current->mm;
+
+ clear_tsk_thread_flag(current, TIF_LDT);
+
+ if (!mm || !mm->context.ldt)
+ return;
+
+ ldt_touch_seg(regs->cs);
+ ldt_touch_seg(regs->ss);
+}
+
+/*
* No need to lock the MM as we are the last user
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -219,7 +219,7 @@ void switch_mm_irqs_off(struct mm_struct
}
load_mm_cr4(next);
- switch_ldt(real_prev, next);
+ switch_ldt(real_prev, next, tsk);
}
/*
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -180,7 +180,7 @@ static void fix_processor_context(void)
syscall_init(); /* This sets MSR_*STAR and related */
#endif
load_TR_desc(); /* This does ltr */
- load_mm_ldt(current->active_mm); /* This does lldt */
+ load_mm_ldt(current->active_mm, current);
initialize_tlbstate_and_flush();
fpu__resume_cpu();
On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
> From: Peter Zijlstra <[email protected]>
>
> When mapping the LDT RO the hardware will typically generate write faults
> on first use. These faults can be trapped and the backing pages can be
> modified by the kernel.
>
> There is one exception; IRET will immediately load CS/SS and unrecoverably
> #GP. To avoid this issue access the LDT descriptors used by CS/SS before
> the IRET to userspace.
>
> For this use LAR, which is a safe operation in that it will happily consume
> an invalid LDT descriptor without traps. It gets the CPU to load the
> descriptor and observes the (preset) ACCESS bit.
>
> So far none of the obvious candidates like dosemu/wine/etc. do care about
> the ACCESS bit at all, so it should be rather safe to enforce it.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/entry/common.c | 8 ++++-
> arch/x86/include/asm/desc.h | 2 +
> arch/x86/include/asm/mmu_context.h | 53 +++++++++++++++++++++++--------------
> arch/x86/include/asm/thread_info.h | 4 ++
> arch/x86/kernel/cpu/common.c | 4 +-
> arch/x86/kernel/ldt.c | 30 ++++++++++++++++++++
> arch/x86/mm/tlb.c | 2 -
> arch/x86/power/cpu.c | 2 -
> 8 files changed, 78 insertions(+), 27 deletions(-)
>
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -30,6 +30,7 @@
> #include <asm/vdso.h>
> #include <linux/uaccess.h>
> #include <asm/cpufeature.h>
> +#include <asm/mmu_context.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/syscalls.h>
> @@ -130,8 +131,8 @@ static long syscall_trace_enter(struct p
> return ret ?: regs->orig_ax;
> }
>
> -#define EXIT_TO_USERMODE_LOOP_FLAGS \
> - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> +#define EXIT_TO_USERMODE_LOOP_FLAGS \
> + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | _TIF_LDT |\
> _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
>
> static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
> @@ -171,6 +172,9 @@ static void exit_to_usermode_loop(struct
> /* Disable IRQs and retry */
> local_irq_disable();
>
> + if (cached_flags & _TIF_LDT)
> + ldt_exit_user(regs);
Nope. To the extent that this code actually does anything (which it
shouldn't since you already forced the access bit), it's racy against
flush_ldt() from another thread, and that race will be exploitable for
privilege escalation. It needs to be outside the loopy part.
--Andy
On Tue, Dec 12, 2017 at 10:03:02AM -0800, Andy Lutomirski wrote:
> On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
> > @@ -171,6 +172,9 @@ static void exit_to_usermode_loop(struct
> > /* Disable IRQs and retry */
> > local_irq_disable();
> >
> > + if (cached_flags & _TIF_LDT)
> > + ldt_exit_user(regs);
>
> Nope. To the extent that this code actually does anything (which it
> shouldn't since you already forced the access bit),
Without this; even with the access bit set; IRET will go wobbly and
we'll #GP on the user-space side. Try it ;-)
> it's racy against
> flush_ldt() from another thread, and that race will be exploitable for
> privilege escalation. It needs to be outside the loopy part.
The flush_ldt (__ldt_install after these patches) would re-set the TIF
flag. But sure, we can move this outside the loop I suppose.
On Tue, Dec 12, 2017 at 10:09 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Dec 12, 2017 at 10:03:02AM -0800, Andy Lutomirski wrote:
>> On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
>
>> > @@ -171,6 +172,9 @@ static void exit_to_usermode_loop(struct
>> > /* Disable IRQs and retry */
>> > local_irq_disable();
>> >
>> > + if (cached_flags & _TIF_LDT)
>> > + ldt_exit_user(regs);
>>
>> Nope. To the extent that this code actually does anything (which it
>> shouldn't since you already forced the access bit),
>
> Without this; even with the access bit set; IRET will go wobbly and
> we'll #GP on the user-space side. Try it ;-)
Maybe later.
But that means that we need Intel and AMD to confirm WTF is going on
before this blows up even with LAR on some other CPU.
>
>> it's racy against
>> flush_ldt() from another thread, and that race will be exploitable for
>> privilege escalation. It needs to be outside the loopy part.
>
> The flush_ldt (__ldt_install after these patches) would re-set the TIF
> flag. But sure, we can move this outside the loop I suppose.
> On Dec 12, 2017, at 10:10 AM, Andy Lutomirski <[email protected]> wrote:
>
>> On Tue, Dec 12, 2017 at 10:09 AM, Peter Zijlstra <[email protected]> wrote:
>>> On Tue, Dec 12, 2017 at 10:03:02AM -0800, Andy Lutomirski wrote:
>>> On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
>>
>>>> @@ -171,6 +172,9 @@ static void exit_to_usermode_loop(struct
>>>> /* Disable IRQs and retry */
>>>> local_irq_disable();
>>>>
>>>> + if (cached_flags & _TIF_LDT)
>>>> + ldt_exit_user(regs);
>>>
>>> Nope. To the extent that this code actually does anything (which it
>>> shouldn't since you already forced the access bit),
>>
>> Without this; even with the access bit set; IRET will go wobbly and
>> we'll #GP on the user-space side. Try it ;-)
>
> Maybe later.
>
> But that means that we need Intel and AMD to confirm WTF is going on
> before this blows up even with LAR on some other CPU.
>
>>
>>> it's racy against
>>> flush_ldt() from another thread, and that race will be exploitable for
>>> privilege escalation. It needs to be outside the loopy part.
>>
>> The flush_ldt (__ldt_install after these patches) would re-set the TIF
>> flag. But sure, we can move this outside the loop I suppose.
Also, why is LAR deferred to user exit? And I thought that LAR didn't set the accessed bit.
If I had to guess, I'd guess that LAR is actually generating a read fault and forcing the pagetables to get populated. If so, then it means the VMA code isn't quite right, or you're susceptible to failures under memory pressure.
Now maybe LAR will repopulate the PTE every time if you were to never clear it, but ick.
On Tue, Dec 12, 2017 at 10:22:48AM -0800, Andy Lutomirski wrote:
>
> Also, why is LAR deferred to user exit? And I thought that LAR didn't
> set the accessed bit.
LAR does not set the ACCESSED bit indeed, we need to explicitly set that
when creating the descriptor.
It also works if you do the LAR right after LLDT (which is what I
originally had). The reason its a TIF flag is that I originally LAR'ed
every entry in the table.
It got reduced to CS/SS, but the TIF thing stayed.
> If I had to guess, I'd guess that LAR is actually generating a read
> fault and forcing the pagetables to get populated. If so, then it
> means the VMA code isn't quite right, or you're susceptible to
> failures under memory pressure.
>
> Now maybe LAR will repopulate the PTE every time if you were to never
> clear it, but ick.
I did not observe #PFs from LAR, we had a giant pile of trace_printk()
in there.
On Tue, 12 Dec 2017, Peter Zijlstra wrote:
> On Tue, Dec 12, 2017 at 10:22:48AM -0800, Andy Lutomirski wrote:
> >
> > Also, why is LAR deferred to user exit? And I thought that LAR didn't
> > set the accessed bit.
>
> LAR does not set the ACCESSED bit indeed, we need to explicitly set that
> when creating the descriptor.
>
> It also works if you do the LAR right after LLDT (which is what I
> originally had). The reason its a TIF flag is that I originally LAR'ed
> every entry in the table.
>
> It got reduced to CS/SS, but the TIF thing stayed.
>
> > If I had to guess, I'd guess that LAR is actually generating a read
> > fault and forcing the pagetables to get populated. If so, then it
> > means the VMA code isn't quite right, or you're susceptible to
> > failures under memory pressure.
> >
> > Now maybe LAR will repopulate the PTE every time if you were to never
> > clear it, but ick.
>
> I did not observe #PFs from LAR, we had a giant pile of trace_printk()
> in there.
The pages are populated _before_ the new ldt is installed. So no memory
pressure issue, nothing. If the populate fails, then modify_ldt() returns
with an error.
Thanks,
tglx
On Tue, Dec 12, 2017 at 07:41:39PM +0100, Thomas Gleixner wrote:
> The pages are populated _before_ the new ldt is installed. So no memory
> pressure issue, nothing. If the populate fails, then modify_ldt() returns
> with an error.
Also, these pages are not visible to reclaim. They're not pagecache and
we didn't install a shrinker.
On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
>
> There is one exception; IRET will immediately load CS/SS and unrecoverably
> #GP. To avoid this issue access the LDT descriptors used by CS/SS before
> the IRET to userspace.
Ok, so the other patch made me nervous, this just makes me go "Hell no!".
This is exactly the kind of "now we get traps in random microcode
places that have never been tested" kind of thing that I was talking
about.
Why is the iret exception unrecoverable anyway? Does anybody even know?
Linus
> On Dec 12, 2017, at 11:05 AM, Linus Torvalds <[email protected]> wrote:
>
>> On Tue, Dec 12, 2017 at 9:32 AM, Thomas Gleixner <[email protected]> wrote:
>>
>> There is one exception; IRET will immediately load CS/SS and unrecoverably
>> #GP. To avoid this issue access the LDT descriptors used by CS/SS before
>> the IRET to userspace.
>
> Ok, so the other patch made me nervous, this just makes me go "Hell no!".
>
> This is exactly the kind of "now we get traps in random microcode
> places that have never been tested" kind of thing that I was talking
> about.
>
> Why is the iret exception unrecoverable anyway? Does anybody even know?
>
Weird microcode shit aside, a fault on IRET will return to kernel code with kernel GS, and then the next time we enter the kernel we're backwards. We could fix idtentry to get this right, but the code is already tangled enough.
This series is full of landmines, I think. My latest patch set has a fully functional LDT with PTI on, and the only thing particularly scary about it is that it fiddles with page tables. Other than that, there's no VMA magic, no RO magic, and no microcode magic. And the LDT is still normal kernel memory, so we can ignore a whole pile of potential attacks.
Also, how does it make any sense to have a cached descriptor that's not accessed? Xen PV does weird LDT page fault shit, and is works, so I suspect we're just misunderstanding something. The VMX spec kind of documents this...
> Linus
From: Andy Lutomirski
> Sent: 12 December 2017 19:27
...
> > Why is the iret exception unrecoverable anyway? Does anybody even know?
> >
>
> Weird microcode shit aside, a fault on IRET will return to kernel code with kernel GS, and then the
> next time we enter the kernel we're backwards. We could fix idtentry to get this right, but the code
> is already tangled enough.
...
Notwithstanding a readonly LDT, the iret (and pop %ds, pop %es that probably
precede it) are all likely to fault in kernel if the segment registers are invalid.
(Setting %fs and %gs for 32 bit processes is left to the reader.)
Unlike every other fault in the kernel code segment, gsbase will contain
the user value, not the kernel one.
The kernel code must detect this somehow and correct everything before (probably)
generating a SIGSEGV and returning to the user's signal handler with the
invalid segment registers in the signal context.
Assuming this won't happen (because the segment registers are always valid)
is likely to be a recipe for disaster (or an escalation).
I guess the problem with a readonly LDT is that you don't want to fault
setting the 'accesses' bit.
David