Changes since V163:
- Moved the cpu entry area out of the fixmap because that caused failures
due to fixmap size and cleanup_highmap() zapping fixmap PTEs.
- Moved all cpu entry area related code into separate files. The
hodgepodge in cpu/common.c was really not appropriate.
- Folded Juergen's XEN PV fix for vsyscall
- Folded Peter's ACCESS bit simplification
- Cleaned up and fixed dump_pagetables
- Added Vlastimil's PTI/NOPTI marker for dumpstack
- Addressed various minor review comments
Diffstat against V163 appended.
Thanks to everyone who looked and cared!
It's perfect now because I'm going to have quiet holidays no matter what.
Nevertheless, please review and test the hell out of it.
The lot is based on:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti.entry
The series is also available from git:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti
The patch tarball is at:
https://tglx.de/~tglx/patches-pti-181.tar.bz2
sha1sum of decompressed tarball: 3b9c1729efe58e793a40031f54e82cac8aba3884
Thanks,
tglx
---
Documentation/x86/x86_64/mm.txt | 4
arch/x86/Kconfig | 3
arch/x86/entry/vsyscall/vsyscall_64.c | 7 -
arch/x86/events/intel/ds.c | 53 ++++++----
arch/x86/include/asm/cpu_entry_area.h | 87 +++++++++++++++++
arch/x86/include/asm/desc.h | 1
arch/x86/include/asm/fixmap.h | 96 -------------------
arch/x86/include/asm/pgtable.h | 4
arch/x86/include/asm/pgtable_32_types.h | 15 ++-
arch/x86/include/asm/pgtable_64_types.h | 55 ++++++-----
arch/x86/kernel/cpu/common.c | 126 -------------------------
arch/x86/kernel/dumpstack.c | 7 +
arch/x86/kernel/tls.c | 11 --
arch/x86/kernel/traps.c | 6 -
arch/x86/mm/Makefile | 8 -
arch/x86/mm/cpu_entry_area.c | 159 ++++++++++++++++++++++++++++++++
arch/x86/mm/dump_pagetables.c | 105 ++++++++++++---------
arch/x86/mm/init_32.c | 6 +
arch/x86/mm/kasan_init_64.c | 6 -
arch/x86/mm/pgtable_32.c | 1
arch/x86/mm/pti.c | 52 +++++-----
arch/x86/xen/mmu_pv.c | 2
22 files changed, 449 insertions(+), 365 deletions(-)
On Wed, 20 Dec 2017, Thomas Gleixner wrote:
> Changes since V163:
>
> - Moved the cpu entry area out of the fixmap because that caused failures
> due to fixmap size and cleanup_highmap() zapping fixmap PTEs.
>
> - Moved all cpu entry area related code into separate files. The
> hodgepodge in cpu/common.c was really not appropriate.
>
> - Folded Juergen's XEN PV fix for vsyscall
>
> - Folded Peter's ACCESS bit simplification
>
> - Cleaned up and fixed dump_pagetables
>
> - Added Vlastimil's PTI/NOPTI marker for dumpstack
>
> - Addressed various minor review comments
>
> Diffstat against V163 appended.
>
> Thanks to everyone who looked and cared!
>
> It's perfect now because I'm going to have quiet holidays no matter what.
Almost perfect. 0-day is amazing. It unearthed yet more include hell. I'm
not going to repost the whole thing. Find the delta fix below.
I've updated the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti
head commit is now: 9b12709513089f4e71190685930e7f9f0d75fee7
The new patch tarball is at:
https://tglx.de/~tglx/patches-pti-184.tar.bz2
sha1sum of decompressed tarball: 2dbdfb57cf65a0c1e558ed55747e17d3c8d8adee
Thanks,
tglx
8<--------------
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index e05a39029446..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -71,13 +71,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
#define CPU_ENTRY_AREA_MAP_SIZE \
(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
- unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
- return (struct cpu_entry_area *) va;
-}
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
static inline struct entry_stack *cpu_entry_stack(int cpu)
{
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 66c0d1207243..b5dfb762c64c 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/spinlock.h>
#include <linux/percpu.h>
+
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
@@ -13,6 +15,15 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
unsigned long va = (unsigned long) cea_vaddr;
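As an aside for readers following the move: get_cpu_entry_area() only does
trivial arithmetic - each CPU gets a fixed-size, page-aligned slot above
CPU_ENTRY_AREA_PER_CPU. Below is a minimal standalone userspace sketch of
that indexing; the constant values are made up for illustration, only the
scheme mirrors the kernel code above.

#include <stdio.h>

#define PAGE_SIZE		4096UL
/* Illustrative placeholders, not the real kernel constants */
#define CPU_ENTRY_AREA_PER_CPU	0xfffffe0000001000UL
#define CPU_ENTRY_AREA_SIZE	(40UL * PAGE_SIZE)

/* Same shape as get_cpu_entry_area(): base + cpu * fixed slot size */
static unsigned long cpu_entry_area_va(int cpu)
{
	return CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		printf("cpu %d -> %#lx\n", cpu, cpu_entry_area_va(cpu));
	return 0;
}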
On Wed, Dec 20, 2017 at 10:35:03PM +0100, Thomas Gleixner wrote:
> The series is also available from git:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti
The patchset looks sane in 5-level paging configuration as long as commit
c739f930be1d ("x86/espfix/64: Fix espfix double-fault handling on 5-level
systems") from tip/x86/urgent is applied.
Tested-by: Kirill A. Shutemov <[email protected]>
BTW, can we push the patch to Linus?
--
Kirill A. Shutemov
On 12/20/2017 04:35 PM, Thomas Gleixner wrote:
> Changes since V163:
>
> - Moved the cpu entry area out of the fixmap because that caused failures
> due to fixmap size and cleanup_highmap() zapping fixmap PTEs.
>
> - Moved all cpu entry area related code into separate files. The
> hodgepodge in cpu/common.c was really not appropriate.
>
> - Folded Juergen's XEN PV fix for vsyscall
>
> - Folded Peter's ACCESS bit simplification
>
> - Cleaned up and fixed dump_pagetables
>
> - Added Vlastimil's PTI/NOPTI marker for dumpstack
>
> - Addressed various minor review comments
>
> Diffstat against V163 appended.
>
> Thanks to everyone who looked and cared!
>
> It's perfect now because I'm going to have quiet holidays no matter what.
>
> Nevertheless, please review and test the hell out of it.
Passed my nightly tests, with both baremetal and various Xen guests.
-boris
On Thu, Dec 21, 2017 at 03:57:02PM +0300, Kirill A. Shutemov wrote:
> On Wed, Dec 20, 2017 at 10:35:03PM +0100, Thomas Gleixner wrote:
> > The series is also available from git:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti
>
> The patchset looks sane in 5-level paging configuration as long as commit
> c739f930be1d ("x86/espfix/64: Fix espfix double-fault handling on 5-level
> systems") from tip/x86/urgent is applied.
>
> Tested-by: Kirill A. Shutemov <[email protected]>
Failed to boot with EFI. I don't think it's limited to 5-level paging.
The fix is below.
BUG: unable to handle kernel paging request at ff1000017d803000
IP: __pti_set_user_pgd+0x22/0x44
PGD 4be4067 P4D 4be5067 PUD 27f905067 PMD 27f718067 PTE 800000017d803060
Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-00174-g9b1270951308 #6613
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
task: ffffffff822114c0 task.stack: ffffffff82200000
RIP: 0010:__pti_set_user_pgd+0x22/0x44
RSP: 0000:ffffffff82203c70 EFLAGS: 00000202
RAX: 000000007c490063 RBX: ffffffff82203e28 RCX: 0000000000000002
RDX: 0000000000000001 RSI: 000000007c490063 RDI: ff1000017d803000
RBP: 000000007ff58000 R08: 0000000000000000 R09: 0000000000000067
R10: ffffffff82203a78 R11: 0000000000000001 R12: ff1000017d802000
R13: 0000000000000000 R14: 000000007ff58000 R15: ffffffff82203e28
FS: 0000000000000000(0000) GS:ff1000007f000000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ff1000017d803000 CR3: 000000000440a000 CR4: 00000000000016b0
Call Trace:
__cpa_process_fault+0x3f6/0x5d0
? get_page_from_freelist+0x34f/0xbd0
__change_page_attr_set_clr+0x820/0xd70
? __alloc_pages_nodemask+0x124/0xfd0
? __alloc_pages_nodemask+0x124/0xfd0
? printk+0x3e/0x46
? kernel_map_pages_in_pgd+0x91/0x180
kernel_map_pages_in_pgd+0x91/0x180
? __map_region+0x37/0x53
__map_region+0x37/0x53
efi_map_region+0x27/0xb3
efi_enter_virtual_mode+0x26f/0x490
start_kernel+0x368/0x3df
secondary_startup_64+0xab/0xb0
Code: 90 90 90 90 90 90 90 90 90 48 89 fa 48 89 f0 81 e2 ff 0f 00 00 48 81 fa ff 07 00 00 77 16 48 89 f2 48 81 cf 00 10 00 00 83 e2 05 <48> 89 37 48 83 fa 05 74 01 c3 48 83 3d 6c 7c 29 01 00 79 f5 48
RIP: __pti_set_user_pgd+0x22/0x44 RSP: ffffffff82203c70
CR2: ff1000017d803000
-----------8<-----------
From 373d3c99f9a8a70f19363c6f007e78216a5f935e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <[email protected]>
Date: Thu, 21 Dec 2017 19:11:54 +0300
Subject: [PATCH] x86/efi: allocate two PGDs for EFI page tables if PTI is
enabled
EFI has its own top-level page table to avoid inserting EFI region
mappings into standard kernel page tables.
With PTI enabled, we need to allocate two PGD pages instead of one.
The user half is never used, but allocating it lets us use the normal
PGD helpers to deal with the EFI page tables.
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/pgalloc.h | 11 +++++++++++
arch/x86/mm/pgtable.c | 11 -----------
arch/x86/platform/efi/efi_64.c | 5 ++++-
3 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
/*
* Allocate and free page tables.
*/
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index c05b6dccc72d..9b7bcbd33cc2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -356,17 +356,6 @@ static inline void _pgd_free(pgd_t *pgd)
}
#else
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-/*
- * Instead of one PGD, we acquire two PGDs. Being order-1, it is
- * both 8k in size and 8k-aligned. That lets us just flip bit 12
- * in a pointer to swap between the two 4k halves.
- */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
-
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 20fb31579b69..39c4b35ac7a4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
* because we want to avoid inserting EFI region mappings (EFI_VA_END
* to EFI_VA_START) into the standard kernel page tables. Everything
* else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
*/
int __init efi_alloc_page_tables(void)
{
@@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
return 0;
gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
return -ENOMEM;
--
Kirill A. Shutemov
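The pgalloc.h comment above ("flip bit 12 in a pointer to swap between the
two 4k halves") is the heart of the PTI PGD layout: the order-1, 8k-aligned
allocation keeps the kernel PGD in the lower 4k page and the user PGD in the
upper 4k page, so toggling the PAGE_SIZE bit of the pointer switches halves.
A standalone userspace sketch of that pointer trick follows; it is loosely
modelled on the kernel's kernel_to_user_pgdp()/user_to_kernel_pgdp() helpers
but is illustration only, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Set bit 12: lower (kernel) half -> upper (user) half */
static void *kernel_to_user_pgd(void *kernel_pgd)
{
	return (void *)((uintptr_t)kernel_pgd | PAGE_SIZE);
}

/* Clear bit 12: upper (user) half -> lower (kernel) half */
static void *user_to_kernel_pgd(void *user_pgd)
{
	return (void *)((uintptr_t)user_pgd & ~(uintptr_t)PAGE_SIZE);
}

int main(void)
{
	/* Order-1, 8k-aligned allocation stands in for __get_free_pages() */
	void *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;
	printf("kernel half: %p\n", pgd);
	printf("user half:   %p\n", kernel_to_user_pgd(pgd));
	printf("and back:    %p\n", user_to_kernel_pgd(kernel_to_user_pgd(pgd)));
	free(pgd);
	return 0;
}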
On Thu, 21 Dec 2017, Kirill A. Shutemov wrote:
> On Thu, Dec 21, 2017 at 03:57:02PM +0300, Kirill A. Shutemov wrote:
> > On Wed, Dec 20, 2017 at 10:35:03PM +0100, Thomas Gleixner wrote:
> > > The series is also available from git:
> > >
> > > git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/pti
> >
> > The patchset looks sane in 5-level paging configuration as long as commit
> > c739f930be1d ("x86/espfix/64: Fix espfix double-fault handling on 5-level
> > systems") from tip/x86/urgent is applied.
> >
> > Tested-by: Kirill A. Shutemov <[email protected]>
>
> Failed to boot with EFI. I don't think it's limited to 5-level paging.
Uuurg. That's been there forever.
> The fix is below.
Thanks for catching and fixing that.
tglx