2017-03-14 17:05:34

by Thomas Garnier

Subject: [PATCH v7 1/3] x86/mm: Adapt MODULES_END based on Fixmap section size

This patch aligns MODULES_END to the beginning of the Fixmap section.
It optimizes the space available for both sections. The address is
pre-computed based on the number of pages required by the Fixmap
section.

It will allow GDT remapping in the Fixmap section. The current
MODULES_END static address does not provide enough space for the kernel
to support a large number of processors.
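
[ Editor's aside, not part of the patch: the rule behind the new
MODULES_END definition is the generic fixmap layout, which grows downward
from FIXADDR_TOP at one page per index. Below is a minimal user-space
sketch of the computation; FIXADDR_TOP and the slot count are made-up
placeholders, not the kernel's real values. ]

#include <stdio.h>

#define PAGE_SHIFT         12
/* Placeholder top-of-fixmap address, not the real kernel constant. */
#define FIXADDR_TOP        0xffffffffff5fe000UL
/* Generic fixmap rule: each index sits one page below the previous one. */
#define __fix_to_virt(idx) (FIXADDR_TOP - ((unsigned long)(idx) << PAGE_SHIFT))

int main(void)
{
	/* Pretend the config needs 600 fixmap slots (e.g. NR_CPUS GDT remaps). */
	unsigned int end_of_fixed_addresses = 600;
	unsigned long modules_end = __fix_to_virt(end_of_fixed_addresses + 1);

	/* MODULES_END lands just below the last fixmap slot and moves
	 * automatically as CONFIG options add or remove slots. */
	printf("MODULES_END would be %#lx\n", modules_end);
	return 0;
}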

Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170308
---
Documentation/x86/x86_64/mm.txt | 5 ++++-
arch/x86/include/asm/pgtable_64_types.h | 3 ++-
arch/x86/kernel/module.c | 1 +
arch/x86/mm/dump_pagetables.c | 1 +
arch/x86/mm/kasan_init_64.c | 1 +
mm/vmalloc.c | 1 +
6 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092db811..ee3f9c30957c 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -19,7 +19,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

@@ -39,6 +39,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.

+The module mapping space size changes based on the CONFIG requirements for the
+following fixmap section.
+
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..bb05e21cf3c7 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -67,7 +67,8 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
-#define MODULES_END _AC(0xffffffffff000000, UL)
+/* The module sections ends with the start of the fixmap */
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 477ae806c2fa..fad61caac75e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
+#include <asm/fixmap.h>

#if 0
#define DEBUGP(fmt, ...) \
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..75efeecc85eb 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -20,6 +20,7 @@

#include <asm/kasan.h>
#include <asm/pgtable.h>
+#include <asm/fixmap.h>

/*
* The dumper groups pagetable entries of the same type into one, and for
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 8d63d7a104c3..1bde19ef86bd 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -9,6 +9,7 @@

#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/fixmap.h>

extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 32979d945766..1fc9598ed019 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -35,6 +35,7 @@
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include <asm/fixmap.h>

#include "internal.h"

--
2.12.0.367.g23dc2f6d3c-goog


2017-03-14 17:05:44

by Thomas Garnier

Subject: [PATCH v7 3/3] x86: Make the GDT remapping read-only on 64-bit

This patch makes the GDT remapped pages read-only to prevent corruption.
This change is done only on 64-bit.

The native_load_tr_desc function was adapted to correctly handle a
read-only GDT. The LTR instruction always writes to the TSS entry in the
GDT, which generates a page fault if the GDT is read-only. This change
checks whether the current GDT is a remap and swaps GDTs as needed. The
function was tested by booting multiple machines and verifying that
hibernation works properly.
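
[ Editor's illustration, not part of the patch: the "write" LTR performs
is a single busy-bit update in the TSS descriptor's access byte. The
sketch below models that update in plain C; the helper name is
hypothetical, and on real hardware the CPU does this internally when LTR
executes. ]

#include <stdio.h>
#include <stdint.h>

#define DESC_TSS_AVAIL 0x9	/* available 64-bit TSS */
#define DESC_TSS_BUSY  0xB	/* busy 64-bit TSS: bit 1 of the type set */

/* Byte 5 of a descriptor holds the type (bits 0-3) plus S, DPL and P. */
static void ltr_busy_bit_write(uint8_t *access_byte)
{
	/* This store is why LTR faults when the GDT page is read-only. */
	*access_byte = (*access_byte & ~0x0f) | DESC_TSS_BUSY;
}

int main(void)
{
	uint8_t access = 0x80 | DESC_TSS_AVAIL;	/* present, available TSS */

	ltr_busy_bit_write(&access);
	printf("access byte after LTR: %#x\n", access);	/* prints 0x8b */
	return 0;
}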

KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
per-cpu variable was removed in favor of functions that fetch the
original GDT. Instead of reloading the previous GDT, VMX will reload the
fixmap GDT as expected. For testing, VMs were started and restored on
multiple configurations.

Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170308
---
arch/x86/include/asm/desc.h | 106 +++++++++++++++++++++++++--------------
arch/x86/include/asm/processor.h | 1 +
arch/x86/kernel/cpu/common.c | 28 ++++++++---
arch/x86/kvm/svm.c | 4 +-
arch/x86/kvm/vmx.c | 12 ++---
5 files changed, 96 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4b5ef0c64291..ec05f9c1a62c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -248,9 +248,77 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
}
}

+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
static inline void native_load_tr_desc(void)
{
+ struct desc_ptr gdt;
+ int cpu = raw_smp_processor_id();
+ bool restore = 0;
+ struct desc_struct *fixmap_gdt;
+
+ native_store_gdt(&gdt);
+ fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+ /*
+ * If the current GDT is the read-only fixmap, swap to the original
+ * writeable version. Swap back at the end.
+ */
+ if (gdt.address == (unsigned long)fixmap_gdt) {
+ load_direct_gdt(cpu);
+ restore = 1;
+ }
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ if (restore)
+ load_fixmap_gdt(cpu);
+}
+#else
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+#endif
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+
+ asm volatile("str %0":"=r" (tr));
+
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}

DECLARE_PER_CPU(bool, __tss_limit_invalid);
@@ -305,44 +373,6 @@ static inline void invalidate_tss_limit(void)
this_cpu_write(__tss_limit_invalid, true);
}

-static inline void native_load_gdt(const struct desc_ptr *dtr)
-{
- asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct desc_ptr *dtr)
-{
- asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct desc_ptr *dtr)
-{
- asm volatile("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct desc_ptr *dtr)
-{
- asm volatile("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
- unsigned long tr;
-
- asm volatile("str %0":"=r" (tr));
-
- return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
- unsigned int i;
-
- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-
/* This intentionally ignores lm, since 32-bit apps don't have that field. */
#define LDT_empty(info) \
((info)->base_addr == 0 && \
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2ec4d2dc559b..28828f1f99a4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr;

extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
+extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3cf1590ec9ce..f8e22dbad86c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,8 +448,15 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}

-/* Used by XEN to force the GDT read-only when required */
+/*
+ * On 64-bit the GDT remapping is read-only.
+ * A global is used for Xen to change the default when required.
+ */
+#ifdef CONFIG_X86_64
+pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL_RO;
+#else
pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL;
+#endif

/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
@@ -458,6 +465,17 @@ static inline void setup_fixmap_gdt(int cpu)
__pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags);
}

+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
/* Load a fixmap remapping of the per-cpu GDT */
void load_fixmap_gdt(int cpu)
{
@@ -467,6 +485,7 @@ void load_fixmap_gdt(int cpu)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);

/*
* Current gdt points %fs at the "master" per-cpu area: after this,
@@ -474,11 +493,8 @@ void load_fixmap_gdt(int cpu)
*/
void switch_to_new_gdt(int cpu)
{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
/* Reload the per-cpu base */
load_percpu_segment(cpu);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d1efe2c62b3f..c02b9af2056a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -741,7 +741,6 @@ static int svm_hardware_enable(void)

struct svm_cpu_data *sd;
uint64_t efer;
- struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();

@@ -763,8 +762,7 @@ static int svm_hardware_enable(void)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;

- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
+ gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

wrmsrl(MSR_EFER, efer | EFER_SVME);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 283aa8601833..cfed1fff43ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-static DEFINE_PER_CPU(struct desc_ptr, host_gdt);

/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@@ -2052,14 +2051,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *table;
unsigned long v;

if (!(selector & ~SEGMENT_RPL_MASK))
return 0;

- table = (struct desc_struct *)gdt->address;
+ table = get_current_gdt_ro();

if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
@@ -2164,7 +2162,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
- load_gdt(this_cpu_ptr(&host_gdt));
+ load_fixmap_gdt(raw_smp_processor_id());
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2266,7 +2264,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}

if (!already_loaded) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ unsigned long gdt = get_current_gdt_ro_vaddr();
unsigned long sysenter_esp;

kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2277,7 +2275,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
- vmcs_writel(HOST_GDTR_BASE, gdt->address);
+ vmcs_writel(HOST_GDTR_BASE, gdt); /* 22.2.4 */

/*
* VM exits change the host TR limit to 0x67 after a VM
@@ -3465,8 +3463,6 @@ static int hardware_enable(void)
ept_sync_global();
}

- native_store_gdt(this_cpu_ptr(&host_gdt));
-
return 0;
}

--
2.12.0.367.g23dc2f6d3c-goog

2017-03-14 17:06:10

by Thomas Garnier

Subject: [PATCH v7 2/3] x86: Remap GDT tables in the Fixmap section

Each processor holds a GDT in its per-cpu structure. The sgdt
instruction gives the base address of the current GDT. This address can
be used to bypass KASLR memory randomization. With another bug, an
attacker could target other per-cpu structures or deduce the base of
the main memory section (PAGE_OFFSET).
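
[ Editor's illustration, not part of the patch: sgdt is not a privileged
instruction on CPUs without UMIP (a feature that postdates this thread),
so the leak described above can be reproduced from plain user space with
the sketch below. ]

#include <stdio.h>
#include <stdint.h>

struct desc_ptr {
	uint16_t size;
	uint64_t address;
} __attribute__((packed));

int main(void)
{
	struct desc_ptr gdt;

	asm volatile("sgdt %0" : "=m" (gdt));
	/* Without the remap, this address points into a per-cpu structure,
	 * exposing a KASLR-randomized kernel address. */
	printf("GDT base: %#lx (limit %#x)\n",
	       (unsigned long)gdt.address, gdt.size);
	return 0;
}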

This patch relocates the GDT table for each processor inside the
Fixmap section. The space is reserved based on the number of supported
processors.

For consistency, the remapping is done by default on both 32-bit and 64-bit.

Each processor switches to its remapped GDT at the end of
initialization. For hibernation, the main processor returns with the
original GDT and switches back to the remapping at completion.

This patch was tested on both architectures. Hibernation and KVM were
both tested specifically for their usage of the GDT.

Thanks to Boris Ostrovsky <[email protected]> for testing and
recommending changes for Xen support.

Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170308
---
arch/x86/entry/vdso/vma.c | 2 +-
arch/x86/include/asm/desc.h | 58 ++++++++++++++++++++++++++++++++---
arch/x86/include/asm/fixmap.h | 4 +++
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/stackprotector.h | 2 +-
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/apm_32.c | 6 ++--
arch/x86/kernel/cpu/common.c | 29 ++++++++++++++++--
arch/x86/kernel/setup_percpu.c | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/platform/efi/efi_32.c | 4 +--
arch/x86/power/cpu.c | 7 +++--
arch/x86/xen/enlighten.c | 5 ++-
arch/x86/xen/mmu.c | 1 +
arch/x86/xen/smp.c | 2 +-
drivers/lguest/x86/core.c | 6 ++--
drivers/pnp/pnpbios/bioscalls.c | 10 +++---
17 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 226ca70dc6bd..5c5d4d7618e6 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -354,7 +354,7 @@ static void vgetcpu_cpu_init(void *arg)
d.p = 1; /* Present */
d.d = 1; /* 32-bit */

- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1548ca92ad3f..4b5ef0c64291 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,6 +4,7 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+#include <asm/fixmap.h>

#include <linux/smp.h>
#include <linux/percpu.h>
@@ -38,6 +39,7 @@ extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
extern const struct desc_ptr debug_idt_descr;
extern gate_desc debug_idt_table[];
+extern pgprot_t pg_fixmap_gdt_flags;

struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
@@ -45,11 +47,57 @@ struct gdt_page {

DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}

+static inline unsigned long get_cpu_gdt_rw_vaddr(unsigned int cpu)
+{
+ return (unsigned long)get_cpu_gdt_rw(cpu);
+}
+
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+ return this_cpu_ptr(&gdt_page)->gdt;
+}
+
+static inline unsigned long get_current_gdt_rw_vaddr(void)
+{
+ return (unsigned long)get_current_gdt_rw();
+}
+
+/* Get the fixmap index for a specific processor */
+static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+{
+ return FIX_GDT_REMAP_BEGIN + cpu;
+}
+
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+{
+ unsigned int idx = get_cpu_gdt_ro_index(cpu);
+ return (struct desc_struct *)__fix_to_virt(idx);
+}
+
+static inline unsigned long get_cpu_gdt_ro_vaddr(int cpu)
+{
+ return (unsigned long)get_cpu_gdt_ro(cpu);
+}
+
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+ return get_cpu_gdt_ro(smp_processor_id());
+}
+
+static inline unsigned long get_current_gdt_ro_vaddr(void)
+{
+ return (unsigned long)get_current_gdt_ro();
+}
+
#ifdef CONFIG_X86_64

static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -174,7 +222,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t

static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
{
- struct desc_struct *d = get_cpu_gdt_table(cpu);
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;

set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@@ -194,7 +242,7 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)

set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
entries * LDT_ENTRY_SIZE - 1);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
@@ -209,7 +257,7 @@ DECLARE_PER_CPU(bool, __tss_limit_invalid);

static inline void force_reload_TR(void)
{
- struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ struct desc_struct *d = get_current_gdt_rw();
tss_desc tss;

memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -288,7 +336,7 @@ static inline unsigned long native_store_tr(void)

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
unsigned int i;

for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f960e21b..b65155cc3760 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -100,6 +100,10 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
+ /* Fixmap entries to remap the GDTs, one per processor. */
+ FIX_GDT_REMAP_BEGIN,
+ FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
__end_of_permanent_fixed_addresses,

/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..2ec4d2dc559b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr;

extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
+extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);

diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 58505f01962f..dcbd9bcce714 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu)
{
#ifdef CONFIG_X86_32
unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
- struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
+ struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
struct desc_struct desc;

desc = gdt_table[GDT_ENTRY_STACK_CANARY];
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 48587335ede8..ed014814ea35 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void)
#ifdef CONFIG_SMP
initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ (unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5a414545e8a3..446b0d3d4932 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)

cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;

@@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call)

cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;

@@ -2352,7 +2352,7 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
- gdt = get_cpu_gdt_table(0);
+ gdt = get_cpu_gdt_rw(0);
set_desc_base(&gdt[APM_CS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
set_desc_base(&gdt[APM_CS_16 >> 3],
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1f9e9d..3cf1590ec9ce 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,6 +448,26 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}

+/* Used by XEN to force the GDT read-only when required */
+pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL;
+
+/* Setup the fixmap mapping only once per-processor */
+static inline void setup_fixmap_gdt(int cpu)
+{
+ __set_fixmap(get_cpu_gdt_ro_index(cpu),
+ __pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags);
+}
+
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+
/*
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
@@ -456,11 +476,10 @@ void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;

- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
/* Reload the per-cpu base */
-
load_percpu_segment(cpu);
}

@@ -1526,6 +1545,9 @@ void cpu_init(void)

if (is_uv_system())
uv_cpu_init();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}

#else
@@ -1581,6 +1603,9 @@ void cpu_init(void)
dbg_restore_debug_regs();

fpu__init_cpu();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#endif

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9820d6d977c6..11338b0b3ad2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu)
pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
0x2 | DESCTYPE_S, 0x8);
gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_table(cpu),
+ write_gdt_entry(get_cpu_gdt_rw(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bd1f1ad35284..f04479a8f74f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long timeout;

idle->thread.sp = (unsigned long)task_pt_regs(idle);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;

diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index cef39b097649..950071171436 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void)
load_cr3(initial_page_table);
__flush_tlb_all();

- gdt_descr.address = __pa(get_cpu_gdt_table(0));
+ gdt_descr.address = __pa(get_cpu_gdt_rw(0));
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);

@@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
struct desc_ptr gdt_descr;

- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+ gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 66ade16c7693..6b05a9219ea2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt)
* 'pmode_gdt' in wakeup_start.
*/
ctxt->gdt_desc.size = GDT_SIZE - 1;
- ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());

store_tr(ctxt->tr);

@@ -162,7 +162,7 @@ static void fix_processor_context(void)
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
- struct desc_struct *desc = get_cpu_gdt_table(cpu);
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
@@ -183,6 +183,9 @@ static void fix_processor_context(void)
load_mm_ldt(current->active_mm); /* This does lldt */

fpu__resume_cpu();
+
+ /* The processor is back on the direct GDT, load back the fixmap */
+ load_fixmap_gdt(cpu);
}

/**
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ec1d5c46e58f..08faa61de5f7 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t,

*shadow = t->tls_array[i];

- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);

@@ -1545,6 +1545,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
*/
xen_initial_gdt = &per_cpu(gdt_page, 0);

+ /* GDT can only be remapped RO */
+ pg_fixmap_gdt_flags = PAGE_KERNEL_RO;
+
xen_smp_init();

#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 37cb5aad71de..ebbfe00133f7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2326,6 +2326,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
+ case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7ff2f1bfb7ec..eaa36162ed4a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;

- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);

#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index d71f6323ac00..b4f79b923aea 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
* byte, not the size, hence the "-1").
*/
state->host_gdt_desc.size = GDT_SIZE-1;
- state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+ state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);

/*
* All CPUs on the Host use the same Interrupt Descriptor
@@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void)
* The Host needs to be able to use the LGUEST segments on this
* CPU, too, so put them in the Host GDT.
*/
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
}

/*
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 438d4c72c7b3..ff563db025b3 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -54,7 +54,7 @@ __asm__(".text \n"

#define Q2_SET_SEL(cpu, selname, address, size) \
do { \
- struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
+ struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \
set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \
set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \
} while(0)
@@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
return PNP_FUNCTION_NOT_SUPPORTED;

cpu = get_cpu();
- save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
- get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
+ save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8];
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc;

/* On some boxes IRQ's during PnP BIOS calls are deadly. */
spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
:"memory");
spin_unlock_irqrestore(&pnp_bios_lock, flags);

- get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40;
put_cpu();

/* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
pnp_bios_callpoint.segment = PNP_CS16;

for_each_possible_cpu(i) {
- struct desc_struct *gdt = get_cpu_gdt_table(i);
+ struct desc_struct *gdt = get_cpu_gdt_rw(i);
if (!gdt)
continue;
set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32],
--
2.12.0.367.g23dc2f6d3c-goog

2017-03-14 21:04:34

by Pavel Machek

Subject: Re: [PATCH v7 3/3] x86: Make the GDT remapping read-only on 64-bit

On Tue 2017-03-14 10:05:08, Thomas Garnier wrote:
> This patch makes the GDT remapped pages read-only to prevent corruption.
> This change is done only on 64-bit.
>
> The native_load_tr_desc function was adapted to correctly handle a
> read-only GDT. The LTR instruction always writes to the TSS entry in the
> GDT, which generates a page fault if the GDT is read-only. This change
> checks whether the current GDT is a remap and swaps GDTs as needed. The
> function was tested by booting multiple machines and verifying that
> hibernation works properly.
>
> KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
> per-cpu variable was removed in favor of functions that fetch the
> original GDT. Instead of reloading the previous GDT, VMX will reload the
> fixmap GDT as expected. For testing, VMs were started and restored on
> multiple configurations.
>
> Signed-off-by: Thomas Garnier <[email protected]>

Can we get the same change for 32-bit, too? Growing differences
between 32 and 64 bit are a bit of a problem...
Pavel

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



2017-03-14 22:46:37

by H. Peter Anvin

Subject: Re: [PATCH v7 3/3] x86: Make the GDT remapping read-only on 64-bit


On March 14, 2017 2:20:19 PM PDT, Thomas Garnier <[email protected]> wrote:
>On Tue, Mar 14, 2017 at 2:04 PM, Pavel Machek <[email protected]> wrote:
>> On Tue 2017-03-14 10:05:08, Thomas Garnier wrote:
>>> This patch makes the GDT remapped pages read-only to prevent corruption.
>>> This change is done only on 64-bit.
>>>
>>> The native_load_tr_desc function was adapted to correctly handle a
>>> read-only GDT. The LTR instruction always writes to the TSS entry in the
>>> GDT, which generates a page fault if the GDT is read-only. This change
>>> checks whether the current GDT is a remap and swaps GDTs as needed. The
>>> function was tested by booting multiple machines and verifying that
>>> hibernation works properly.
>>>
>>> KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
>>> per-cpu variable was removed in favor of functions that fetch the
>>> original GDT. Instead of reloading the previous GDT, VMX will reload the
>>> fixmap GDT as expected. For testing, VMs were started and restored on
>>> multiple configurations.
>>>
>>> Signed-off-by: Thomas Garnier <[email protected]>
>>
>> Can we get the same change for 32-bit, too? Growing differences
>> between 32 and 64 bit are a bit of a problem...
>> Pavel
>
>It was discussed on previous versions that 32-bit read-only support
>would create issues; that's why it was favored for 64-bit only right now.
>
>> --
>> (english) http://www.livejournal.com/~pavelmachek
>> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

We can't make the GDT read-only on 32-bit since we use task switches for last-resort recovery; 64-bit has IST instead.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2017-03-15 13:58:32

by Boris Ostrovsky

Subject: Re: [PATCH v7 1/3] x86/mm: Adapt MODULES_END based on Fixmap section size

On 03/14/2017 01:05 PM, Thomas Garnier wrote:
> This patch aligns MODULES_END to the beginning of the Fixmap section.
> It optimizes the space available for both sections. The address is
> pre-computed based on the number of pages required by the Fixmap
> section.
>
> It will allow GDT remapping in the Fixmap section. The current
> MODULES_END static address does not provide enough space for the kernel
> to support a large number of processors.
>
> Signed-off-by: Thomas Garnier <[email protected]>

For Xen bits (and to some extent bare-metal):

Reviewed-and-Tested-by: Boris Ostrovsky <[email protected]>


2017-03-16 08:10:25

by Ingo Molnar

Subject: Re: [PATCH v7 1/3] x86/mm: Adapt MODULES_END based on Fixmap section size


* Thomas Garnier <[email protected]> wrote:

> This patch aligns MODULES_END to the beginning of the Fixmap section.
> It optimizes the space available for both sections. The address is
> pre-computed based on the number of pages required by the Fixmap
> section.
>
> It will allow GDT remapping in the Fixmap section. The current
> MODULES_END static address does not provide enough space for the kernel
> to support a large number of processors.
>
> Signed-off-by: Thomas Garnier <[email protected]>
> ---
> Based on next-20170308
> ---
> Documentation/x86/x86_64/mm.txt | 5 ++++-
> arch/x86/include/asm/pgtable_64_types.h | 3 ++-
> arch/x86/kernel/module.c | 1 +
> arch/x86/mm/dump_pagetables.c | 1 +
> arch/x86/mm/kasan_init_64.c | 1 +
> mm/vmalloc.c | 1 +

> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -35,6 +35,7 @@
> #include <linux/uaccess.h>
> #include <asm/tlbflush.h>
> #include <asm/shmparam.h>
> +#include <asm/fixmap.h>
>
> #include "internal.h"

Note that asm/fixmap.h is an x86-ism that isn't present in many other
architectures, so this hunk will break the build.

To make progress with these patches I've fixed it up with an ugly #ifdef
CONFIG_X86, but it needs a real solution before this can be pushed
upstream.

Thanks,

Ingo

=====================>
mm/vmalloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dabea6a29fad..b7d2a23349f4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -35,7 +35,10 @@
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-#include <asm/fixmap.h>
+
+#ifdef CONFIG_X86
+# include <asm/fixmap.h>
+#endif

#include "internal.h"


Subject: [tip:x86/mm] x86/mm: Adapt MODULES_END based on fixmap section size

Commit-ID: f06bdd4001c257792c54dce9427399f2896470af
Gitweb: http://git.kernel.org/tip/f06bdd4001c257792c54dce9427399f2896470af
Author: Thomas Garnier <[email protected]>
AuthorDate: Tue, 14 Mar 2017 10:05:06 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 16 Mar 2017 09:06:24 +0100

x86/mm: Adapt MODULES_END based on fixmap section size

This patch aligns MODULES_END to the beginning of the fixmap section.
It optimizes the space available for both sections. The address is
pre-computed based on the number of pages required by the fixmap
section.

It will allow GDT remapping in the fixmap section. The current
MODULES_END static address does not provide enough space for the kernel
to support a large number of processors.

Signed-off-by: Thomas Garnier <[email protected]>
Cc: Alexander Potapenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Chris Wilson <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Dmitry Vyukov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Jiri Kosina <[email protected]>
Cc: Joerg Roedel <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Juergen Gross <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Luis R . Rodriguez <[email protected]>
Cc: Matt Fleming <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Paul Gortmaker <[email protected]>
Cc: Pavel Machek <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Rafael J . Wysocki <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Vitaly Kuznetsov <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: zijun_hu <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ Small build fix. ]
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/x86/x86_64/mm.txt | 5 ++++-
arch/x86/include/asm/pgtable_64_types.h | 3 ++-
arch/x86/kernel/module.c | 1 +
arch/x86/mm/dump_pagetables.c | 1 +
arch/x86/mm/kasan_init_64.c | 1 +
mm/vmalloc.c | 4 ++++
6 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092..ee3f9c3 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -19,7 +19,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

@@ -39,6 +39,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.

+The module mapping space size changes based on the CONFIG requirements for the
+following fixmap section.
+
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 0b2797e..516593e6 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -68,7 +68,8 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
-#define MODULES_END _AC(0xffffffffff000000, UL)
+/* The module sections ends with the start of the fixmap */
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 477ae80..fad61ca 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
+#include <asm/fixmap.h>

#if 0
#define DEBUGP(fmt, ...) \
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee..75efeec 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -20,6 +20,7 @@

#include <asm/kasan.h>
#include <asm/pgtable.h>
+#include <asm/fixmap.h>

/*
* The dumper groups pagetable entries of the same type into one, and for
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 8d63d7a..1bde19e 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -9,6 +9,7 @@

#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm/fixmap.h>

extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0dd8022..b7d2a23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,6 +36,10 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

+#ifdef CONFIG_X86
+# include <asm/fixmap.h>
+#endif
+
#include "internal.h"

struct vfree_deferred {

Subject: [tip:x86/mm] x86: Make the GDT remapping read-only on 64-bit

Commit-ID: 45fc8757d1d2128e342b4e7ef39adedf7752faac
Gitweb: http://git.kernel.org/tip/45fc8757d1d2128e342b4e7ef39adedf7752faac
Author: Thomas Garnier <[email protected]>
AuthorDate: Tue, 14 Mar 2017 10:05:08 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 16 Mar 2017 09:06:35 +0100

x86: Make the GDT remapping read-only on 64-bit

This patch makes the GDT remapped pages read-only, to prevent accidental
(or intentional) corruption of this key data structure.

This change is done only on 64-bit, because 32-bit needs it to be writable
for TSS switches.

The native_load_tr_desc function was adapted to correctly handle a
read-only GDT. The LTR instruction always writes to the TSS entry in the
GDT, which generates a page fault if the GDT is read-only. This change
checks whether the current GDT is a remap and swaps GDTs as needed. The
function was tested by booting multiple machines and verifying that
hibernation works properly.

KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the
per-cpu variable was removed in favor of functions that fetch the
original GDT. Instead of reloading the previous GDT, VMX will reload the
fixmap GDT as expected. For testing, VMs were started and restored on
multiple configurations.

Signed-off-by: Thomas Garnier <[email protected]>
Cc: Alexander Potapenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Chris Wilson <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Dmitry Vyukov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Jiri Kosina <[email protected]>
Cc: Joerg Roedel <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Juergen Gross <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Luis R . Rodriguez <[email protected]>
Cc: Matt Fleming <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Paul Gortmaker <[email protected]>
Cc: Pavel Machek <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Rafael J . Wysocki <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Vitaly Kuznetsov <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: zijun_hu <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/desc.h | 106 +++++++++++++++++++++++++--------------
arch/x86/include/asm/processor.h | 1 +
arch/x86/kernel/cpu/common.c | 28 ++++++++---
arch/x86/kvm/svm.c | 4 +-
arch/x86/kvm/vmx.c | 12 ++---
5 files changed, 96 insertions(+), 55 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4b5ef0c..ec05f9c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -248,9 +248,77 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
}
}

+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
static inline void native_load_tr_desc(void)
{
+ struct desc_ptr gdt;
+ int cpu = raw_smp_processor_id();
+ bool restore = 0;
+ struct desc_struct *fixmap_gdt;
+
+ native_store_gdt(&gdt);
+ fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+ /*
+ * If the current GDT is the read-only fixmap, swap to the original
+ * writeable version. Swap back at the end.
+ */
+ if (gdt.address == (unsigned long)fixmap_gdt) {
+ load_direct_gdt(cpu);
+ restore = 1;
+ }
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ if (restore)
+ load_fixmap_gdt(cpu);
+}
+#else
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+#endif
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+
+ asm volatile("str %0":"=r" (tr));
+
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}

DECLARE_PER_CPU(bool, __tss_limit_invalid);
@@ -305,44 +373,6 @@ static inline void invalidate_tss_limit(void)
this_cpu_write(__tss_limit_invalid, true);
}

-static inline void native_load_gdt(const struct desc_ptr *dtr)
-{
- asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct desc_ptr *dtr)
-{
- asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct desc_ptr *dtr)
-{
- asm volatile("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct desc_ptr *dtr)
-{
- asm volatile("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
- unsigned long tr;
-
- asm volatile("str %0":"=r" (tr));
-
- return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
- unsigned int i;
-
- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-
/* This intentionally ignores lm, since 32-bit apps don't have that field. */
#define LDT_empty(info) \
((info)->base_addr == 0 && \
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1150e1b..edf42c4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr;

extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
+extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3cf1590..f8e22db 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,8 +448,15 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}

-/* Used by XEN to force the GDT read-only when required */
+/*
+ * On 64-bit the GDT remapping is read-only.
+ * A global is used for Xen to change the default when required.
+ */
+#ifdef CONFIG_X86_64
+pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL_RO;
+#else
pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL;
+#endif

/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
@@ -458,6 +465,17 @@ static inline void setup_fixmap_gdt(int cpu)
__pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags);
}

+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
/* Load a fixmap remapping of the per-cpu GDT */
void load_fixmap_gdt(int cpu)
{
@@ -467,6 +485,7 @@ void load_fixmap_gdt(int cpu)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);

/*
* Current gdt points %fs at the "master" per-cpu area: after this,
@@ -474,11 +493,8 @@ void load_fixmap_gdt(int cpu)
*/
void switch_to_new_gdt(int cpu)
{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
/* Reload the per-cpu base */
load_percpu_segment(cpu);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d1efe2c..c02b9af 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -741,7 +741,6 @@ static int svm_hardware_enable(void)

struct svm_cpu_data *sd;
uint64_t efer;
- struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();

@@ -763,8 +762,7 @@ static int svm_hardware_enable(void)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;

- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
+ gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

wrmsrl(MSR_EFER, efer | EFER_SVME);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 98e82ee..596a76d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-static DEFINE_PER_CPU(struct desc_ptr, host_gdt);

/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@@ -2052,14 +2051,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *table;
unsigned long v;

if (!(selector & ~SEGMENT_RPL_MASK))
return 0;

- table = (struct desc_struct *)gdt->address;
+ table = get_current_gdt_ro();

if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
@@ -2164,7 +2162,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
- load_gdt(this_cpu_ptr(&host_gdt));
+ load_fixmap_gdt(raw_smp_processor_id());
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2266,7 +2264,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}

if (!already_loaded) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ unsigned long gdt = get_current_gdt_ro_vaddr();
unsigned long sysenter_esp;

kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2277,7 +2275,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
- vmcs_writel(HOST_GDTR_BASE, gdt->address);
+ vmcs_writel(HOST_GDTR_BASE, gdt); /* 22.2.4 */

/*
* VM exits change the host TR limit to 0x67 after a VM
@@ -3465,8 +3463,6 @@ static int hardware_enable(void)
ept_sync_global();
}

- native_store_gdt(this_cpu_ptr(&host_gdt));
-
return 0;
}


Subject: [tip:x86/mm] x86: Remap GDT tables in the fixmap section

Commit-ID: 69218e47994da614e7af600bf06887750ab6657a
Gitweb: http://git.kernel.org/tip/69218e47994da614e7af600bf06887750ab6657a
Author: Thomas Garnier <[email protected]>
AuthorDate: Tue, 14 Mar 2017 10:05:07 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 16 Mar 2017 09:06:35 +0100

x86: Remap GDT tables in the fixmap section

Each processor holds a GDT in its per-cpu structure. The sgdt
instruction gives the base address of the current GDT. This address can
be used to bypass KASLR memory randomization. With another bug, an
attacker could target other per-cpu structures or deduce the base of
the main memory section (PAGE_OFFSET).

This patch relocates the GDT table for each processor inside the
fixmap section. The space is reserved based on the number of supported
processors.

For consistency, the remapping is done by default on both 32-bit and 64-bit.

Each processor switches to its remapped GDT at the end of
initialization. For hibernation, the main processor returns with the
original GDT and switches back to the remapping at completion.

This patch was tested on both architectures. Hibernation and KVM were
both tested specifically for their usage of the GDT.

Thanks to Boris Ostrovsky <[email protected]> for testing and
recommending changes for Xen support.

Signed-off-by: Thomas Garnier <[email protected]>
Cc: Alexander Potapenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Chris Wilson <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Dmitry Vyukov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Jiri Kosina <[email protected]>
Cc: Joerg Roedel <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Juergen Gross <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Luis R . Rodriguez <[email protected]>
Cc: Matt Fleming <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Paul Gortmaker <[email protected]>
Cc: Pavel Machek <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Rafael J . Wysocki <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Stanislaw Gruszka <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Vitaly Kuznetsov <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: zijun_hu <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/vdso/vma.c | 2 +-
arch/x86/include/asm/desc.h | 58 ++++++++++++++++++++++++++++++++---
arch/x86/include/asm/fixmap.h | 4 +++
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/stackprotector.h | 2 +-
arch/x86/kernel/acpi/sleep.c | 2 +-
arch/x86/kernel/apm_32.c | 6 ++--
arch/x86/kernel/cpu/common.c | 29 ++++++++++++++++--
arch/x86/kernel/setup_percpu.c | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/platform/efi/efi_32.c | 4 +--
arch/x86/power/cpu.c | 7 +++--
arch/x86/xen/enlighten.c | 5 ++-
arch/x86/xen/mmu.c | 1 +
arch/x86/xen/smp.c | 2 +-
drivers/lguest/x86/core.c | 6 ++--
drivers/pnp/pnpbios/bioscalls.c | 10 +++---
17 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 226ca70..5c5d4d7 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -354,7 +354,7 @@ static void vgetcpu_cpu_init(void *arg)
d.p = 1; /* Present */
d.d = 1; /* 32-bit */

- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1548ca9..4b5ef0c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,6 +4,7 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+#include <asm/fixmap.h>

#include <linux/smp.h>
#include <linux/percpu.h>
@@ -38,6 +39,7 @@ extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
extern const struct desc_ptr debug_idt_descr;
extern gate_desc debug_idt_table[];
+extern pgprot_t pg_fixmap_gdt_flags;

struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
@@ -45,11 +47,57 @@ struct gdt_page {

DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}

+static inline unsigned long get_cpu_gdt_rw_vaddr(unsigned int cpu)
+{
+ return (unsigned long)get_cpu_gdt_rw(cpu);
+}
+
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+ return this_cpu_ptr(&gdt_page)->gdt;
+}
+
+static inline unsigned long get_current_gdt_rw_vaddr(void)
+{
+ return (unsigned long)get_current_gdt_rw();
+}
+
+/* Get the fixmap index for a specific processor */
+static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+{
+ return FIX_GDT_REMAP_BEGIN + cpu;
+}
+
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+{
+ unsigned int idx = get_cpu_gdt_ro_index(cpu);
+ return (struct desc_struct *)__fix_to_virt(idx);
+}
+
+static inline unsigned long get_cpu_gdt_ro_vaddr(int cpu)
+{
+ return (unsigned long)get_cpu_gdt_ro(cpu);
+}
+
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+ return get_cpu_gdt_ro(smp_processor_id());
+}
+
+static inline unsigned long get_current_gdt_ro_vaddr(void)
+{
+ return (unsigned long)get_current_gdt_ro();
+}
+
#ifdef CONFIG_X86_64

static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -174,7 +222,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t

static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
{
- struct desc_struct *d = get_cpu_gdt_table(cpu);
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;

set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@@ -194,7 +242,7 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)

set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
entries * LDT_ENTRY_SIZE - 1);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
@@ -209,7 +257,7 @@ DECLARE_PER_CPU(bool, __tss_limit_invalid);

static inline void force_reload_TR(void)
{
- struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ struct desc_struct *d = get_current_gdt_rw();
tss_desc tss;

memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -288,7 +336,7 @@ static inline unsigned long native_store_tr(void)

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
unsigned int i;

for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f96..b65155c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -100,6 +100,10 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
+ /* Fixmap entries to remap the GDTs, one per processor. */
+ FIX_GDT_REMAP_BEGIN,
+ FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
__end_of_permanent_fixed_addresses,

/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7caa2ac..1150e1b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr;

extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
+extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);

diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 58505f0..dcbd9bc 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu)
{
#ifdef CONFIG_X86_32
unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
- struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
+ struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
struct desc_struct desc;

desc = gdt_table[GDT_ENTRY_STACK_CANARY];
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 4858733..ed01481 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void)
#ifdef CONFIG_SMP
initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ (unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5a41454..446b0d3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)

cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;

@@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call)

cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;

@@ -2352,7 +2352,7 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
- gdt = get_cpu_gdt_table(0);
+ gdt = get_cpu_gdt_rw(0);
set_desc_base(&gdt[APM_CS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
set_desc_base(&gdt[APM_CS_16 >> 3],
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1..3cf1590 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,6 +448,26 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}

+/* Used by XEN to force the GDT read-only when required */
+pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL;
+
+/* Setup the fixmap mapping only once per-processor */
+static inline void setup_fixmap_gdt(int cpu)
+{
+ __set_fixmap(get_cpu_gdt_ro_index(cpu),
+ __pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags);
+}
+
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+
/*
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
@@ -456,11 +476,10 @@ void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;

- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
/* Reload the per-cpu base */
-
load_percpu_segment(cpu);
}

@@ -1526,6 +1545,9 @@ void cpu_init(void)

if (is_uv_system())
uv_cpu_init();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}

#else
@@ -1581,6 +1603,9 @@ void cpu_init(void)
dbg_restore_debug_regs();

fpu__init_cpu();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#endif

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9820d6d..11338b0 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu)
pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
0x2 | DESCTYPE_S, 0x8);
gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_table(cpu),
+ write_gdt_entry(get_cpu_gdt_rw(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bd1f1ad..f04479a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long timeout;

idle->thread.sp = (unsigned long)task_pt_regs(idle);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;

diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index cef39b0..9500711 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void)
load_cr3(initial_page_table);
__flush_tlb_all();

- gdt_descr.address = __pa(get_cpu_gdt_table(0));
+ gdt_descr.address = __pa(get_cpu_gdt_rw(0));
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);

@@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
struct desc_ptr gdt_descr;

- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+ gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);

diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 66ade16..6b05a92 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt)
* 'pmode_gdt' in wakeup_start.
*/
ctxt->gdt_desc.size = GDT_SIZE - 1;
- ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());

store_tr(ctxt->tr);

@@ -162,7 +162,7 @@ static void fix_processor_context(void)
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
- struct desc_struct *desc = get_cpu_gdt_table(cpu);
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
@@ -183,6 +183,9 @@ static void fix_processor_context(void)
load_mm_ldt(current->active_mm); /* This does lldt */

fpu__resume_cpu();
+
+ /* The processor is back on the direct GDT, load back the fixmap */
+ load_fixmap_gdt(cpu);
}

/**
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ec1d5c4..08faa61 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t,

*shadow = t->tls_array[i];

- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);

@@ -1545,6 +1545,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
*/
xen_initial_gdt = &per_cpu(gdt_page, 0);

+ /* GDT can only be remapped RO */
+ pg_fixmap_gdt_flags = PAGE_KERNEL_RO;
+
xen_smp_init();

#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 37cb5aa..ebbfe00 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2326,6 +2326,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
+ case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7ff2f1b..eaa3616 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;

- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);

#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index d71f632..b4f79b9 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
* byte, not the size, hence the "-1").
*/
state->host_gdt_desc.size = GDT_SIZE-1;
- state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+ state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);

/*
* All CPUs on the Host use the same Interrupt Descriptor
@@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void)
* The Host needs to be able to use the LGUEST segments on this
* CPU, too, so put them in the Host GDT.
*/
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
}

/*
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 438d4c7..ff563db 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -54,7 +54,7 @@ __asm__(".text \n"

#define Q2_SET_SEL(cpu, selname, address, size) \
do { \
- struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
+ struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \
set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \
set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \
} while(0)
@@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
return PNP_FUNCTION_NOT_SUPPORTED;

cpu = get_cpu();
- save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
- get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
+ save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8];
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc;

/* On some boxes IRQ's during PnP BIOS calls are deadly. */
spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
:"memory");
spin_unlock_irqrestore(&pnp_bios_lock, flags);

- get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40;
put_cpu();

/* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
pnp_bios_callpoint.segment = PNP_CS16;

for_each_possible_cpu(i) {
- struct desc_struct *gdt = get_cpu_gdt_table(i);
+ struct desc_struct *gdt = get_cpu_gdt_rw(i);
if (!gdt)
continue;
set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32],