Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755063AbeAMMwN (ORCPT + 1 other); Sat, 13 Jan 2018 07:52:13 -0500 Received: from atrey.karlin.mff.cuni.cz ([195.113.26.193]:35031 "EHLO atrey.karlin.mff.cuni.cz" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754959AbeAMMwL (ORCPT ); Sat, 13 Jan 2018 07:52:11 -0500 Date: Sat, 13 Jan 2018 13:52:08 +0100 From: Pavel Machek To: Linus Torvalds , vcaputo@pengaru.com Cc: Arnd Bergmann , Olivier Galibert , Linux Kernel Mailing List , jikos@suse.cz Subject: kernel page table isolation for x86-32 was Re: Linux 4.15-rc7 Message-ID: <20180113125208.GA32738@amd> References: <20180110233252.GA14739@amd> <20180112110624.GA13254@amd> <20180112193815.GA4918@amd> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="huq684BweRXVnRxX" Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.23 (2014-03-12) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Return-Path: --huq684BweRXVnRxX Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable Hi! > > I'll try to do the right thing. OTOH... I don't like the fact that > > kernel memory on my machine is currently readable, probably even from > > javascript. >=20 > Oh, absolutely. I'm just saying that it's probably best to try to > start from the x86-64 KPTI model, and see how that works for x86-32. Ok, it should not be too bad. Here's something... getting it to compile should be easy, getting it to work might be trickier. Not sure what needs to be done for the LDT. Pavel =09 diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt index d11eff6..e13e1e5 100644 --- a/Documentation/x86/pti.txt +++ b/Documentation/x86/pti.txt @@ -124,7 +124,7 @@ Possible Future Work boot-time switching. =20 Testing -=3D=3D=3D=3D=3D=3D=3D=3D +=3D=3D=3D=3D=3D=3D=3D =20 To test stability of PTI, the following test procedure is recommended, ideally doing all of these in parallel: diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 45a63e0..b0485cc 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -332,6 +332,99 @@ For 32-bit we have the following conventions - kernel = is built with =20 #endif =20 +#else + +/* + * x86-32 kernel page table isolation. + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * halves: + */ +#define PTI_SWITCH_PGTABLES_MASK (1< #include =20 +#include "calling.h" +=09 .section .entry.text, "ax" =20 /* @@ -428,6 +430,7 @@ ENTRY(entry_SYSENTER_32) pushl $0 /* pt_regs->ip =3D 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=3D$-ENOSYS /* save rest */ + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=3D%edx save_reg=3D%ecx =20 /* * SYSENTER doesn't filter flags, so we need to clear NT, AC @@ -464,6 +467,7 @@ ENTRY(entry_SYSENTER_32) ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV =20 + RESTORE_CR3 scratch_reg=3D%edx save_reg=3D%ecx /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ @@ -539,6 +543,7 @@ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=3D$-ENOSYS /* save rest */ + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=3D%edx save_reg=3D%ecx =20 /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -552,6 +557,7 @@ ENTRY(entry_INT80_32) =20 restore_all: TRACE_IRQS_IRET + RESTORE_CR3 scratch_reg=3D%eax save_reg=3D%ecx .Lrestore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e51b65a..a87fb89 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -4,7 +4,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Pavel Machek SuSE Labs * * entry.S contains the system-call and fault low-level handling routines. * diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtab= le_32.h index e67c062..1b36e56 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -107,4 +107,78 @@ do { \ */ #define LOWMEM_PAGES ((((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)) =20 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +#ifndef __ASSEMBLY__ +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr =3D (unsigned long)ptr; + + __ptr |=3D BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr =3D (unsigned long)ptr; + + __ptr &=3D ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif +#endif + #endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm= /pgtable_32_types.h index ce245b0..804fc33 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -62,4 +62,6 @@ extern bool __vmalloc_start_set; /* set once high_memory = is set */ =20 #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) =20 +#define LDT_BASE_ADDR 0 /* FIXME */ + #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/= processor-flags.h index 6a60fea..8f1cf71 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -39,10 +39,6 @@ #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) =20 -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define X86_CR3_PTI_SWITCH_BIT 11 -#endif - #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save @@ -53,4 +49,8 @@ #define CR3_NOFLUSH 0 #endif =20 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_SWITCH_BIT 11 +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index dbaf14d..849e073 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,5 +1,7 @@ #define pr_fmt(fmt) "SMP alternatives: " fmt =20 +#define DEBUG + #include #include #include @@ -28,7 +30,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched); =20 #define MAX_PATCH_LEN (255-1) =20 -static int __initdata_or_module debug_alternative; +static int __initdata_or_module debug_alternative =3D 1; =20 static int __init debug_alt(char *str) { @@ -60,7 +62,7 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk( "%s: " fmt "\n", __func__, ##args); \ } while (0) =20 #define DUMP_BYTES(buf, len, fmt, args...) \ @@ -71,7 +73,7 @@ do { \ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk( fmt, ##args); \ for (j =3D 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ @@ -373,6 +375,8 @@ void __init_or_module noinline apply_alternatives(struc= t alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; =20 + printk("apply_alternatives: entering\n"); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c290209..002ffaf 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -505,6 +505,31 @@ __INITDATA GLOBAL(early_recursion_flag) .long 0 =20 +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ +GLOBAL(name) +=09 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 1024 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + __REFDATA .align 4 ENTRY(initial_code) @@ -516,24 +541,26 @@ ENTRY(setup_once_ref) * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE #ifdef CONFIG_X86_PAE -.globl initial_pg_pmd +NEXT_PGD_PAGE(initial_pg_pmd) initial_pg_pmd: .fill 1024*KPMDS,4,0 + .fill PTI_USER_PGD_FILL,4,0 #else -.globl initial_page_table +NEXT_PGD_PAGE(initial_page_table) initial_page_table: .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 #endif initial_pg_fixmap: .fill 1024,4,0 .globl empty_zero_page empty_zero_page: .fill 4096,1,0 -.globl swapper_pg_dir +NEXT_PGD_PAGE(swapper_pg_dir) swapper_pg_dir: .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 EXPORT_SYMBOL(empty_zero_page) =20 /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 04a625f..57f5cd4 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -3,7 +3,7 @@ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * * Copyright (C) 2000 Andrea Arcangeli SuSE - * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen * Copyright (C) 2005 Eric Biederman diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2a4849e..896b53b 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -543,7 +543,11 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); static void ptdump_walk_user_pgd_level_checkwx(void) { #ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_X86_64 pgd_t *pgd =3D (pgd_t *) &init_top_pgt; +#else + pgd_t *pgd =3D swapper_pg_dir; +#endif =20 if (!static_cpu_has(X86_FEATURE_PTI)) return; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ce38f16..029b5c8 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -113,8 +113,11 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * Top-level entries added to init_mm's usermode pgd after boot * will not be automatically propagated to other mms. */ +#ifdef X86_64 + /* FIXME? */ if (!pgdp_maps_userspace(pgdp)) return pgd; +#endif =20 /* * The user page tables get the full PGD, accessible from @@ -166,7 +169,9 @@ static __init p4d_t *pti_user_pagetable_walk_p4d(unsign= ed long address) =20 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } +#ifdef X86_64 BUILD_BUG_ON(pgd_large(*pgd) !=3D 0); +#endif =20 return p4d_offset(pgd, address); } diff --git a/security/Kconfig b/security/Kconfig index 1f96e19..ad77de4 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -57,7 +57,7 @@ config SECURITY_NETWORK config PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y - depends on X86_64 && !UML + depends on (X86_32 || X86_64) && !UML help This feature reduces the number of hardware side channels by ensuring that the majority of kernel addresses are not mapped --=20 (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blo= g.html --huq684BweRXVnRxX Content-Type: application/pgp-signature; name="signature.asc" Content-Description: Digital signature -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iEYEARECAAYFAlpaAPgACgkQMOfwapXb+vIzCQCcCeDCjnSnu47jAWGXHp0j7YkV Q4kAoLtluZmUOjFlrt1o3PpSpVNwb4SW =RAye -----END PGP SIGNATURE----- --huq684BweRXVnRxX--