From: Andy Lutomirski <luto@kernel.org>
To: x86@kernel.org
Cc: linux-kernel@vger.kernel.org, Borislav Petkov, Brian Gerst,
    David Laight, Kees Cook, Peter Zijlstra, Andy Lutomirski
Subject: [PATCH] LDT improvements
Date: Wed, 6 Dec 2017 23:22:21 -0800
Message-Id: <48fe5cf1382d6a95c7b1837415882edcc81a9781.1512631324.git.luto@kernel.org>
X-Mailer: git-send-email 2.13.6

I think I like this approach.  I also think it might be nice to move
the whole cpu_entry_area into this new pgd range so that we can stop
mucking around with the fixmap.

TODO:
 - It crashes in ldt_gdt_64.  Not sure why.
 - 5-level docs aren't updated and the code is untested.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 Documentation/x86/x86_64/mm.txt         |  11 ++--
 arch/x86/include/asm/mmu_context.h      |  33 +++++++++-
 arch/x86/include/asm/pgtable_64_types.h |   2 +
 arch/x86/include/asm/processor.h        |  23 +++++--
 arch/x86/kernel/ldt.c                   | 109 +++++++++++++++++++++++++++++++-
 5 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

 Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

 Architecture defines a 64-bit virtual address. Implementations can support
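As a cross-check on the new "LDT range" line above: the range is simply PGD
entry -3, one slot below the ESPFIX entry at -2 (see the pgtable_64_types.h
hunk later in this patch).  A standalone sketch, not part of the patch, that
reproduces the documented addresses; it assumes 4-level paging, where
PGDIR_SHIFT is 39:

#include <stdio.h>

/* Assumed value: PGDIR_SHIFT is 39 only with 4-level page tables. */
#define PGDIR_SHIFT   39
#define LDT_PGD_ENTRY (-3UL)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)

int main(void)
{
	unsigned long start = LDT_BASE_ADDR;
	unsigned long end = start + (1UL << PGDIR_SHIFT) - 1;

	/* Prints "fffffe8000000000 - fffffeffffffffff": one full PGD slot. */
	printf("%lx - %lx\n", start, end);
	return 0;
}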
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
 	 */
 	struct desc_struct *entries;
 	unsigned int nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot).
+	 */
+	int slot;
 };

+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static void *ldt_slot_va(int slot)
+{
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
 /*
  * Used for LDT copy/destruction.
  */
 int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline int init_new_context_ldt(struct task_struct *tsk,
 				       struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 */

-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				clear_LDT();
+				return;
+			}
+
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
+	}
 #else
 	clear_LDT();
 #endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }

 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_LEN	(MODULES_END - MODULES_VADDR)
 #define ESPFIX_PGD_ENTRY _AC(-2, UL)
 #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY	_AC(-3, UL)
+#define LDT_BASE_ADDR	(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
 #define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
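One note on the constants just added before moving on: LDT_SLOT_STRIDE is
8192 entries * 8 bytes = 64 KiB, so the two slots together cover 128 KiB and
the "Both LDT slots are contained in a single PMD" assumption in
map_ldt_struct() below holds.  A minimal sketch, again not part of the patch,
using the architectural LDT_ENTRIES/LDT_ENTRY_SIZE values from the x86
headers and the 4-level LDT_BASE_ADDR:

#include <stdio.h>

#define LDT_ENTRIES     8192				/* from asm/ldt.h */
#define LDT_ENTRY_SIZE  8
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)	/* 64 KiB */
#define LDT_BASE_ADDR   0xfffffe8000000000UL		/* 4-level value */
#define PMD_SIZE        (1UL << 21)			/* 2 MiB */

int main(void)
{
	unsigned long slot0 = LDT_BASE_ADDR;
	unsigned long slot1 = LDT_BASE_ADDR + LDT_SLOT_STRIDE;

	/* LDT_BASE_ADDR is PGD-aligned, so both 64 KiB slots share a PMD. */
	printf("slot 0 at %lx, slot 1 at %lx, same PMD: %s\n",
	       slot0, slot1,
	       (slot0 & ~(PMD_SIZE - 1)) == (slot1 & ~(PMD_SIZE - 1)) ?
	       "yes" : "no");
	return 0;
}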
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
 #else
 /*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..a0008fb26ba2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>

 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;

 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;

-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	__flush_tlb_all();
+	load_mm_ldt(mm);

 	refresh_ldt_segments();
 }
@@ -90,9 +90,93 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 	}

 	new_ldt->nr_entries = num_entries;
+	new_ldt->slot = -1;
 	return new_ldt;
 }

+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool is_vmalloc;
+	int i;
+	bool awful_hack;
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return 0;
+
+	WARN_ON(ldt->slot != -1);
+
+	/* Both LDT slots are contained in a single PMD. */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+	awful_hack = pgd_none(*pgd);
+
+	p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+	if (!p4d)
+		return -ENOMEM;
+
+	pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+	if (!pud)
+		return -ENOMEM;
+	pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+	if (!pmd)
+		return -ENOMEM;
+	if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+		return -ENOMEM;
+
+	if (true) {
+		/* awful hack -- pti_set_user_pgd is a mess */
+		/*
+		 * We can't even use the bool awful_hack because of
+		 * pti_init_all_pgds(), which poops on our pgd.
+		 */
+		kernel_to_user_pgdp(pgd)->pgd = pgd->pgd;
+	}
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+
+		pte = pte_offset_kernel(pmd, va);
+		set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+	}
+
+	flush_tlb_mm_range(mm,
+			   (unsigned long)ldt_slot_va(slot),
+			   (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+			   0);
+
+	ldt->slot = slot;
+
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
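The remaining hunks wire the new mapping into the LDT life cycle.  The key
policy, visible in write_ldt() below, is double buffering: a freshly written
LDT is mapped into whichever slot the current LDT is not using, so CPUs still
loading the old alias keep a valid mapping until install_ldt() switches over.
A restatement purely for illustration; pick_ldt_slot() is not a function in
this patch:

/*
 * Illustration only -- not part of the patch.  write_ldt() below
 * open-codes this as: old_ldt ? !old_ldt->slot : 0.
 */
static int pick_ldt_slot(const struct ldt_struct *old_ldt)
{
	/* The first LDT for an mm goes in slot 0; later ones alternate. */
	return old_ldt ? !old_ldt->slot : 0;
}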
@@ -155,8 +239,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
 	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval)
+		goto out_free;

 	mm->context.ldt = new_ldt;
+	goto out_unlock;
+
+out_free:
+	free_ldt_pgtables(mm);
+	free_ldt_struct(new_ldt);
+	return retval;

 out_unlock:
 	mutex_unlock(&old_mm->context.lock);
@@ -174,6 +267,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }

+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -285,6 +383,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);

+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
-- 
2.13.6
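For anyone reproducing the ldt_gdt_64 crash from the TODO list, the path
under test can be driven from userspace with a single modify_ldt() call, and
each call after the first exercises the slot flip.  A minimal exerciser,
offered as a sketch only (the real test is
tools/testing/selftests/x86/ldt_gdt.c):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>		/* struct user_desc */

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.limit_in_pages = 1;

	/* Function 0x11 is the new-mode write: write_ldt() with oldmode=0. */
	if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
		perror("modify_ldt");
		return 1;
	}

	/* A second call here would map the new LDT into the other slot. */
	printf("LDT entry 0 installed\n");
	return 0;
}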