2015-07-21 19:59:38

by Andy Lutomirski

Subject: [PATCH v2 0/3] x86: modify_ldt improvement, test, and config option

Here's v2. It fixes the "dazed and confused" issue, I hope. It's also
probably a good general attack surface reduction, and it replaces some
scary code with IMO less scary code.

Also, servers and embedded systems should probably turn off modify_ldt.
This makes that possible.

Xen people, can you take a look at this? I think that, with this change,
write_ldt_entry is unnecessary.

Changes from v1:
- The config option is new.
- The test case is new.
- Fixed a missing allocation failure check.
- Fixed a use-after-free on fork().

Andy Lutomirski (3):
x86/ldt: Make modify_ldt synchronous
x86/ldt: Make modify_ldt optional
selftests/x86, x86/ldt: Add a selftest for modify_ldt

arch/x86/Kconfig | 17 ++
arch/x86/include/asm/desc.h | 15 --
arch/x86/include/asm/mmu.h | 5 +-
arch/x86/include/asm/mmu_context.h | 63 ++++-
arch/x86/kernel/Makefile | 3 +-
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/cpu/perf_event.c | 16 +-
arch/x86/kernel/ldt.c | 247 +++++++++--------
arch/x86/kernel/process_64.c | 6 +-
arch/x86/kernel/step.c | 8 +-
arch/x86/power/cpu.c | 3 +-
kernel/sys_ni.c | 1 +
tools/testing/selftests/x86/Makefile | 2 +-
tools/testing/selftests/x86/ldt_gdt.c | 492 ++++++++++++++++++++++++++++++++++
14 files changed, 730 insertions(+), 152 deletions(-)
create mode 100644 tools/testing/selftests/x86/ldt_gdt.c

--
2.4.3


2015-07-21 19:59:44

by Andy Lutomirski

Subject: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

modify_ldt has questionable locking and does not synchronize
threads. Improve it: redesign the locking and synchronize all
threads' LDTs using an IPI on all modifications.

This will dramatically slow down modify_ldt in multithreaded
programs, but there shouldn't be any multithreaded programs that
care about modify_ldt's performance in the first place.
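
For readers unfamiliar with the ordering primitives this relies on: the
patch pairs smp_store_release() in the writer with lockless_dereference()
in the reader. Below is a minimal userspace model of that pairing, using
C11 atomics as stand-ins for the kernel primitives (acquire is a
conservative substitute for the data-dependency ordering that
lockless_dereference() provides, and the struct is only an illustrative
skeleton, not the patch's definition):

#include <stdatomic.h>

struct ldt_struct {
        int size;
        int __pad;
        /* struct desc_struct entries[]; follows in the real patch */
};

static _Atomic(struct ldt_struct *) ldt_ptr;

/* Writer: fully initialize the new table, then publish it. */
static void publish_ldt(struct ldt_struct *new_ldt)
{
        /* Release store: all prior writes to *new_ldt become visible
         * to any reader that sees the new pointer. This models
         * smp_store_release() in install_ldt(); the patch then sends
         * an IPI so other CPUs reload their LDT register. */
        atomic_store_explicit(&ldt_ptr, new_ldt, memory_order_release);
}

/* Reader: models lockless_dereference() in load_mm_ldt(). */
static struct ldt_struct *read_ldt_ptr(void)
{
        return atomic_load_explicit(&ldt_ptr, memory_order_acquire);
}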

Cc: [email protected]
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/desc.h | 15 ---
arch/x86/include/asm/mmu.h | 3 +-
arch/x86/include/asm/mmu_context.h | 48 ++++++-
arch/x86/kernel/cpu/common.c | 4 +-
arch/x86/kernel/cpu/perf_event.c | 12 +-
arch/x86/kernel/ldt.c | 247 +++++++++++++++++++------------------
arch/x86/kernel/process_64.c | 4 +-
arch/x86/kernel/step.c | 6 +-
arch/x86/power/cpu.c | 3 +-
9 files changed, 192 insertions(+), 150 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a0bf89fd2647..4e10d73cf018 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -280,21 +280,6 @@ static inline void clear_LDT(void)
set_ldt(NULL, 0);
}

-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
- set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
- preempt_disable();
- load_LDT_nolock(pc);
- preempt_enable();
-}
-
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 09b9620a73b4..364d27481a52 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,8 +9,7 @@
* we put the segment information here.
*/
typedef struct {
- void *ldt;
- int size;
+ struct ldt_struct *ldt;

#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e8daee7c5c9..1ff121fbf366 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif

/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+ int size;
+ int __pad; /* keep the descriptors naturally aligned. */
+ struct desc_struct entries[];
+};
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+ struct ldt_struct *ldt;
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt))
+ set_ldt(ldt->entries, ldt->size);
+ else
+ clear_LDT();
+}
+
+/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
@@ -78,12 +116,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* was called and then modify_ldt changed
* prev->context.ldt but suppressed an IPI to this CPU.
* In this case, prev->context.ldt != NULL, because we
- * never free an LDT while the mm still exists. That
- * means that next->context.ldt != prev->context.ldt,
- * because mms never share an LDT.
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
*/
if (unlikely(prev->context.ldt != next->context.ldt))
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
#ifdef CONFIG_SMP
else {
@@ -106,7 +144,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0cea4c..cb9e5df42dd2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1410,7 +1410,7 @@ void cpu_init(void)
load_sp0(t, &current->thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);

clear_all_debug_regs();
dbg_restore_debug_regs();
@@ -1459,7 +1459,7 @@ void cpu_init(void)
load_sp0(t, thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);

t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de47900f..9469dfa55607 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,21 +2179,25 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;

if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ struct ldt_struct *ldt;
+
if (idx > LDT_ENTRIES)
return 0;

- if (idx > current->active_mm->context.size)
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = lockless_dereference(current->active_mm->context.ldt);
+ if (!ldt || idx > ldt->size)
return 0;

- desc = current->active_mm->context.ldt;
+ desc = &ldt->entries[idx];
} else {
if (idx > GDT_ENTRIES)
return 0;

- desc = raw_cpu_ptr(gdt_page.gdt);
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
}

- return get_desc_base(desc + idx);
+ return get_desc_base(desc);
}

#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c37886d759cc..d65e6ec90338 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,82 +20,70 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>

-#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ if (current->active_mm == current_mm) {
+ /* context.lock is held for us, so we don't need any locking. */
+ mm_context_t *pc = &current->active_mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->size);
+ }
}
-#endif

-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+static struct ldt_struct *alloc_ldt_struct(int size)
{
- void *oldldt, *newldt;
- int oldsize;
+ struct ldt_struct *new_ldt;
+ int alloc_size;

- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+ if (size > LDT_ENTRIES)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = sizeof(struct ldt_struct) + size * LDT_ENTRY_SIZE;
+
+ if (alloc_size > PAGE_SIZE)
+ new_ldt = vmalloc(alloc_size);
else
- newldt = (void *)__get_free_page(GFP_KERNEL);
+ new_ldt = (void *)__get_free_page(GFP_KERNEL);
+ if (!new_ldt)
+ return NULL;

- if (!newldt)
- return -ENOMEM;
+ new_ldt->size = size;
+ paravirt_alloc_ldt(new_ldt->entries, size);
+ return new_ldt;
+}

- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
+static void install_ldt(struct mm_struct *current_mm,
+ struct ldt_struct *ldt)
+{
+ /* context.lock is held */
+ preempt_disable();

- paravirt_alloc_ldt(newldt, mincount);
+ /* Synchronizes with lockless_dereference in load_mm_ldt. */
+ smp_store_release(&current_mm->context.ldt, ldt);

-#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
+ /* Activate for this CPU. */
+ flush_ldt(current->mm);

- if (reload) {
#ifdef CONFIG_SMP
- preempt_disable();
- load_LDT(pc);
- if (!cpumask_equal(mm_cpumask(current->mm),
- cpumask_of(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#else
- load_LDT(pc);
+ /* Synchronize with other CPUs. */
+ if (!cpumask_equal(mm_cpumask(current_mm),
+ cpumask_of(smp_processor_id())))
+ smp_call_function(flush_ldt, current_mm, 1);
#endif
- }
- if (oldsize) {
- paravirt_free_ldt(oldldt, oldsize);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
- }
- return 0;
+ preempt_enable();
}

-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+static void free_ldt_struct(struct ldt_struct *ldt)
{
- int err = alloc_ldt(new, old->size, 0);
- int i;
-
- if (err < 0)
- return err;
-
- for (i = 0; i < old->size; i++)
- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
- return 0;
+ if (unlikely(ldt)) {
+ int alloc_size = sizeof(struct ldt_struct) +
+ ldt->size * LDT_ENTRY_SIZE;
+ paravirt_free_ldt(ldt->entries, ldt->size);
+ if (alloc_size > PAGE_SIZE)
+ vfree(ldt);
+ else
+ put_page(virt_to_page(ldt));
+ }
}

/*
@@ -106,15 +94,35 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
struct mm_struct *old_mm;
int retval = 0;
+ int i;

mutex_init(&mm->context.lock);
- mm->context.size = 0;
old_mm = current->mm;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm) {
+ mm->context.ldt = NULL;
+ return 0;
}
+
+ mutex_lock(&old_mm->context.lock);
+ if (old_mm->context.ldt) {
+ struct ldt_struct *new_ldt =
+ alloc_ldt_struct(old_mm->context.ldt->size);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < old_mm->context.ldt->size; i++)
+ write_ldt_entry(new_ldt->entries, i,
+ &old_mm->context.ldt->entries[i]);
+
+ mm->context.ldt = new_ldt;
+ } else {
+ mm->context.ldt = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}

@@ -125,53 +133,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*/
void destroy_context(struct mm_struct *mm)
{
- if (mm->context.size) {
-#ifdef CONFIG_X86_32
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
-#endif
- paravirt_free_ldt(mm->context.ldt, mm->context.size);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
}

static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
+ int retval;
unsigned long size;
struct mm_struct *mm = current->mm;

- if (!mm->context.size)
- return 0;
+ mutex_lock(&mm->context.lock);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
+
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;

- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
+ size = mm->context.ldt->size * LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;

- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
if (size != bytecount) {
- /* zero-fill the rest */
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ mutex_unlock(&mm->context.lock);
+ return retval;
}

static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -189,12 +191,16 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
return bytecount;
}

+static struct desc_struct blank_desc;
+
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
struct desc_struct ldt;
- int error;
+ int error, i;
struct user_desc ldt_info;
+ int oldsize, newsize;
+ struct ldt_struct *new_ldt, *old_ldt;

error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -213,34 +219,41 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
goto out;
}

- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(&current->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && ldt_info.base_addr == 0 && ldt_info.limit == 0) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}

- if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
- error = -EINVAL;
+ mutex_lock(&mm->context.lock);
+
+ old_ldt = mm->context.ldt;
+ oldsize = old_ldt ? old_ldt->size : 0;
+ newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(newsize);
+ if (!new_ldt)
goto out_unlock;
- }

- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+ for (i = 0; i < oldsize; i++)
+ write_ldt_entry(new_ldt->entries, i,
+ &old_ldt->entries[i]);
+ for (i = oldsize; i < newsize; i++)
+ write_ldt_entry(new_ldt->entries, i, &blank_desc);
+ write_ldt_entry(new_ldt->entries, ldt_info.entry_number, &ldt);

- /* Install the new entry ... */
-install:
- write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ install_ldt(mm, new_ldt);
+ free_ldt_struct(old_ldt);
error = 0;

out_unlock:
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71d7849a07f7..f6b916387590 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
- if (dead_task->mm->context.size) {
+ if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
- dead_task->mm->context.size);
+ dead_task->mm->context.ldt->size);
BUG();
}
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 9b4d51d0c0d0..6273324186ac 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <asm/desc.h>
+#include <asm/mmu_context.h>

unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
@@ -30,10 +31,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
seg &= ~7UL;

mutex_lock(&child->mm->context.lock);
- if (unlikely((seg >> 3) >= child->mm->context.size))
+ if (unlikely(!child->mm->context.ldt ||
+ (seg >> 3) >= child->mm->context.ldt->size))
addr = -1L; /* bogus selector, access would fault */
else {
- desc = child->mm->context.ldt + seg;
+ desc = &child->mm->context.ldt->entries[seg];
base = get_desc_base(desc);

/* 16-bit code segment? */
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0d7dd1f5ac36..9ab52791fed5 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -22,6 +22,7 @@
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
+#include <asm/mmu_context.h>

#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -153,7 +154,7 @@ static void fix_processor_context(void)
syscall_init(); /* This sets MSR_*STAR and related */
#endif
load_TR_desc(); /* This does ltr */
- load_LDT(&current->active_mm->context); /* This does lldt */
+ load_mm_ldt(current->active_mm); /* This does lldt */

fpu__resume_cpu();
}
--
2.4.3

2015-07-21 20:03:16

by Andy Lutomirski

Subject: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

The modify_ldt syscall exposes a large attack surface and is
unnecessary for modern userspace. Make it optional.
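
As a minimal sketch (not part of this patch) of what disabling the option
means for userspace: the cond_syscall() stub makes the vacated syscall
return -ENOSYS, which a program can probe for before relying on the LDT.
The probe below uses a read (func 0), which never modifies anything:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

int main(void)
{
        struct user_desc buf;

        memset(&buf, 0, sizeof(buf));
        /* func 0 reads the LDT; the syscall fails with ENOSYS if it
         * is compiled out and returns >= 0 otherwise. */
        if (syscall(SYS_modify_ldt, 0, &buf, sizeof(buf)) < 0 &&
            errno == ENOSYS)
                printf("modify_ldt is compiled out\n");
        else
                printf("modify_ldt is available\n");
        return 0;
}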

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/Kconfig | 17 +++++++++++++++++
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
arch/x86/kernel/Makefile | 3 ++-
arch/x86/kernel/cpu/perf_event.c | 4 ++++
arch/x86/kernel/process_64.c | 2 ++
arch/x86/kernel/step.c | 2 ++
kernel/sys_ni.c | 1 +
8 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced17dc95..a7ff3980bd65 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1009,6 +1009,7 @@ config VM86
config X86_16BIT
bool "Enable support for 16-bit segments" if EXPERT
default y
+ depends on MODIFY_LDT_SYSCALL
---help---
This option is required by programs like Wine to run 16-bit
protected mode legacy code on x86 processors. Disabling
@@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.

+config MODIFY_LDT_SYSCALL
+ bool "Enable the LDT (local descriptor table)" if EXPERT
+ default y
+ ---help---
+ Linux can allow user programs to install a per-process x86
+ Local Descriptor Table (LDT) using the modify_ldt(2) system
+ call. This is required to run 16-bit or segmented code such as
+ DOSEMU or some Wine programs. It is also used by some very old
+ threading libraries.
+
+ Enabling this feature adds a small amount of overhead to
+ context switches and increases the low-level kernel attack
+ surface. Disabling it removes the modify_ldt(2) system call.
+
+ Saying 'N' here may make sense for embedded or server kernels.
+
source "kernel/livepatch/Kconfig"

endmenu
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 364d27481a52..55234d5e7160 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,7 +9,9 @@
* we put the segment information here.
*/
typedef struct {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
+#endif

#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1ff121fbf366..26e774da82da 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif

+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
@@ -43,10 +44,24 @@ struct ldt_struct {
struct desc_struct entries[];
};

+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#else /* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+
static inline void load_mm_ldt(struct mm_struct *mm)
{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
- DEBUG_LOCKS_WARN_ON(!irqs_disabled());

/* lockless_dereference synchronizes with smp_store_release */
ldt = lockless_dereference(mm->context.ldt);
@@ -69,14 +84,12 @@ static inline void load_mm_ldt(struct mm_struct *mm)
set_ldt(ldt->entries, ldt->size);
else
clear_LDT();
-}
-
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
+#else
+ clear_LDT();
+#endif

+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+}

static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
@@ -108,6 +121,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Load per-mm CR4 state */
load_mm_cr4(next);

+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT, if the LDT is different.
*
@@ -122,6 +136,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
if (unlikely(prev->context.ldt != next->context.ldt))
load_mm_ldt(next);
+#endif
}
#ifdef CONFIG_SMP
else {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..2b507befcd3f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,8 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace

obj-y := process_$(BITS).o signal.o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9469dfa55607..58b872ef2329 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;

if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;

if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;

desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
} else {
if (idx > GDT_ENTRIES)
return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f6b916387590..941295ddf802 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
dead_task->mm->context.ldt->size);
BUG();
}
+#endif
}
}

diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 6273324186ac..fd88e152d584 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
return addr;
}

+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
}
mutex_unlock(&child->mm->context.lock);
}
+#endif

return addr;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..ca7d84f438f1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
cond_syscall(sys_ipc);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
--
2.4.3

2015-07-21 20:02:50

by Andy Lutomirski

Subject: [PATCH v2 3/3] selftests/x86, x86/ldt: Add a selftest for modify_ldt

This tests general modify_ldt behavior (only writes, so far) as
well as synchronous updates via IPI. It fails on old kernels.

I called this ldt_gdt because I'll add set_thread_area tests to
it at some point.
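
One piece of background the test leans on throughout: an x86 segment
selector packs the descriptor index, a table-indicator bit (0 = GDT,
1 = LDT), and a requested privilege level into 16 bits. A small helper
(hypothetical, mirroring the "(index << 3) | (ldt << 2) | 3" expression
used in the test) makes the layout explicit:

#include <stdint.h>

static inline uint16_t make_selector(uint16_t index, int ldt, int rpl)
{
        /* bits 15..3: descriptor index; bit 2: TI (1 = LDT);
         * bits 1..0: requested privilege level (3 = user). */
        return (uint16_t)((index << 3) | ((ldt ? 1 : 0) << 2) | (rpl & 3));
}

/* Example: make_selector(0, 1, 3) == 0x7, the value the multi-CPU
 * test loads into %ss before the other thread clears LDT entry 0. */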

Signed-off-by: Andy Lutomirski <[email protected]>
---
tools/testing/selftests/x86/Makefile | 2 +-
tools/testing/selftests/x86/ldt_gdt.c | 492 ++++++++++++++++++++++++++++++++++
2 files changed, 493 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/x86/ldt_gdt.c

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index caa60d56d7d1..4138387b892c 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,7 +4,7 @@ include ../lib.mk

.PHONY: all all_32 all_64 warn_32bit_failure clean

-TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
+TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt
TARGETS_C_32BIT_ONLY := entry_from_vm86

TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
new file mode 100644
index 000000000000..6f6699f0351a
--- /dev/null
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -0,0 +1,492 @@
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <string.h>
+#include <errno.h>
+#include <xmmintrin.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/ldt.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+#include <linux/futex.h>
+
+#define AR_ACCESSED (1<<8)
+
+#define AR_TYPE_RODATA (0 * (1<<9))
+#define AR_TYPE_RWDATA (1 * (1<<9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9))
+#define AR_TYPE_XOCODE (4 * (1<<9))
+#define AR_TYPE_XRCODE (5 * (1<<9))
+#define AR_TYPE_XOCODE_CONF (6 * (1<<9))
+#define AR_TYPE_XRCODE_CONF (7 * (1<<9))
+
+#define AR_DPL3 (3 * (1<<13))
+
+#define AR_S (1 << 12)
+#define AR_P (1 << 15)
+#define AR_AVL (1 << 20)
+#define AR_L (1 << 21)
+#define AR_DB (1 << 22)
+#define AR_G (1 << 23)
+
+static int nerrs;
+
+static void check_invalid_segment(uint16_t index, int ldt)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (has_limit || has_ar) {
+ printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s entry %hu is invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ }
+}
+
+static void check_valid_segment(uint16_t index, int ldt,
+ uint32_t expected_ar, uint32_t expected_limit)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (!has_limit || !has_ar) {
+ printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ return;
+ }
+
+ if (ar != expected_ar) {
+ printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+ nerrs++;
+ } else if (limit != expected_limit) {
+ printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, limit);
+ }
+}
+
+static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
+ bool oldmode)
+{
+ int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ desc, sizeof(*desc));
+ if (ret < -1)
+ errno = -ret;
+ if (ret == 0) {
+ uint32_t limit = desc->limit;
+ if (desc->limit_in_pages)
+ limit = (limit << 12) + 4095;
+ check_valid_segment(desc->entry_number, 1, ar, limit);
+ return true;
+ } else if (errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt is returned -ENOSYS\n");
+ return false;
+ } else {
+ if (desc->seg_32bit) {
+ printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+ errno);
+ nerrs++;
+ return false;
+ } else {
+ printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+ return false;
+ }
+ }
+}
+
+static bool install_valid(const struct user_desc *desc, uint32_t ar)
+{
+ return install_valid_mode(desc, ar, false);
+}
+
+static void install_invalid(const struct user_desc *desc, bool oldmode)
+{
+ int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ desc, sizeof(*desc));
+ if (ret < -1)
+ errno = -ret;
+ if (ret == 0) {
+ check_invalid_segment(desc->entry_number, 1);
+ } else if (errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt is returned -ENOSYS\n");
+ } else {
+ if (desc->seg_32bit) {
+ printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+ errno);
+ nerrs++;
+ } else {
+ printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+ }
+ }
+}
+
+static int safe_modify_ldt(int func, struct user_desc *ptr,
+ unsigned long bytecount)
+{
+ int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount);
+ if (ret < -1)
+ errno = -ret;
+ return ret;
+}
+
+static void fail_install(struct user_desc *desc)
+{
+ if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) {
+ printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
+ nerrs++;
+ } else if (errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt is returned -ENOSYS\n");
+ } else {
+ printf("[OK]\tmodify_ldt failure %d\n", errno);
+ }
+}
+
+static void do_simple_tests(void)
+{
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 10,
+ .seg_32bit = 1,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ check_invalid_segment(1, 1);
+
+ desc.entry_number = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ check_invalid_segment(1, 1);
+
+ desc.base_addr = 0xf0000000;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ desc.useable = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_32bit = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_G | AR_AVL);
+
+ desc.seg_32bit = 1;
+ desc.contents = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.contents = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.limit_in_pages = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.contents = 3;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.contents = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+
+#ifdef __x86_64__
+ desc.lm = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+ desc.lm = 0;
+#endif
+
+ bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ if (entry1_okay) {
+ printf("[RUN]\tTest fork\n");
+ pid_t child = fork();
+ if (child == 0) {
+ nerrs = 0;
+ check_valid_segment(desc.entry_number, 1,
+ AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL, desc.limit);
+ check_invalid_segment(1, 1);
+ exit(nerrs ? 1 : 0);
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+ } else {
+ printf("[SKIP]\tSkipping fork test because have no LDT\n");
+ }
+
+ /* Test entry_number too high. */
+ desc.entry_number = 100000;
+ fail_install(&desc);
+
+ /* Test deletion and actions mistakeable for deletion. */
+ memset(&desc, 0, sizeof(desc));
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.seg_not_present = 0;
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
+
+ desc.read_exec_only = 0;
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.read_exec_only = 1;
+ desc.limit = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.limit = 0;
+ desc.base_addr = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.base_addr = 0;
+ install_invalid(&desc, false);
+
+ desc.seg_not_present = 0;
+ desc.read_exec_only = 0;
+ desc.seg_32bit = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+ install_invalid(&desc, true);
+}
+
+/*
+ * 0: thread is idle
+ * 1: thread armed
+ * 2: thread should clear LDT entry 0
+ * 3: thread should exit
+ */
+static volatile unsigned int ftx;
+
+static void *threadproc(void *ctx)
+{
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(1, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 1"); /* should never fail */
+
+ while (1) {
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+ while (ftx != 2) {
+ if (ftx == 3)
+ return NULL;
+ }
+
+ /* clear LDT entry 0 */
+ const struct user_desc desc = {};
+ if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
+ err(1, "modify_ldt");
+
+ ftx = 0;
+ }
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static void do_multicpu_tests(void)
+{
+ cpu_set_t cpuset;
+ pthread_t thread;
+ int failures = 0, iters = 5, i;
+ unsigned short orig_ss;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(1, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tCannot set affinity to CPU 1\n");
+ return;
+ }
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(0, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+ printf("[SKIP]\tCannot set affinity to CPU 0\n");
+ return;
+ }
+
+ sethandler(SIGSEGV, sigsegv, 0);
+
+ printf("[RUN]\tCross-CPU LDT invalidation\n");
+
+ if (pthread_create(&thread, 0, threadproc, 0) != 0)
+ err(1, "pthread_create");
+
+ asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
+
+ for (i = 0; i < 5; i++) {
+ if (sigsetjmp(jmpbuf, 1) != 0)
+ continue;
+
+ /* Make sure the thread is ready after the last test. */
+ while (ftx != 0)
+ ;
+
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .contents = 0, /* Data */
+ .read_exec_only = 0,
+ .limit_in_pages = 1,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+
+ if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+ if (errno != ENOSYS)
+ err(1, "modify_ldt");
+ printf("[SKIP]\tmodify_ldt unavailable\n");
+ break;
+ }
+
+ /* Arm the thread. */
+ ftx = 1;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+ asm volatile ("mov %0, %%ss" : : "r" (0x7));
+
+ /* Go! */
+ ftx = 2;
+
+ while (ftx != 0)
+ ;
+
+ /*
+ * On success, modify_ldt will segfault us synchronously,
+ * and we'll escape via siglongjmp.
+ */
+
+ failures++;
+ asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
+ };
+
+ ftx = 3;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+ if (pthread_join(thread, NULL) != 0)
+ err(1, "pthread_join");
+
+ if (failures) {
+ printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
+ nerrs++;
+ } else {
+ printf("[OK]\tAll %d iterations succeeded\n", iters);
+ }
+}
+
+int main()
+{
+ do_simple_tests();
+
+ do_multicpu_tests();
+
+ return nerrs ? 1 : 0;
+}
--
2.4.3

2015-07-21 20:21:31

by Sasha Levin

Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
> The modify_ldt syscall exposes a large attack surface and is
> unnecessary for modern userspace. Make it optional.

Since this is a "default y" option I think we need to make the
implications of this a bit clearer.

Do we know what userspace would break?

Maybe add a WARN_ONCE() in a stub syscall?


Thanks,
Sasha

2015-07-21 20:28:07

by Andy Lutomirski

Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 1:20 PM, Sasha Levin <[email protected]> wrote:
> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>> The modify_ldt syscall exposes a large attack surface and is
>> unnecessary for modern userspace. Make it optional.
>
> Since this is a "default y" option I think we need to make the
> implications of this a bit clearer.

Do you mean improving the help text? To be clear, there's no change
on a non-EXPERT or default EXPERT kernel here.

>
> Do we know what userspace would break?

Some Wine and some DOSEMU most likely. Also many of the exploits I've
written over the past year or two :)

>
> Maybe add a WARN_ONCE() in a stub syscall?
>

I think if we do that then we should do it for all the syscall disabling things.
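
For concreteness, a hypothetical warning stub along the lines Sasha
suggests might look like the sketch below (not part of any posted patch;
pr_warn_once() could be a gentler choice for a path userspace can trigger
at will):

#ifndef CONFIG_MODIFY_LDT_SYSCALL
SYSCALL_DEFINE3(modify_ldt, int, func, void __user *, ptr,
                unsigned long, bytecount)
{
        WARN_ONCE(1, "%s (%d) called modify_ldt, which is compiled out\n",
                  current->comm, task_pid_nr(current));
        return -ENOSYS;
}
#endif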

--Andy

2015-07-21 20:29:00

by Brian Gerst

Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
> The modify_ldt syscall exposes a large attack surface and is
> unnecessary for modern userspace. Make it optional.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/Kconfig | 17 +++++++++++++++++
> arch/x86/include/asm/mmu.h | 2 ++
> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
> arch/x86/kernel/Makefile | 3 ++-
> arch/x86/kernel/cpu/perf_event.c | 4 ++++
> arch/x86/kernel/process_64.c | 2 ++
> arch/x86/kernel/step.c | 2 ++
> kernel/sys_ni.c | 1 +
> 8 files changed, 53 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 55bced17dc95..a7ff3980bd65 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1009,6 +1009,7 @@ config VM86
> config X86_16BIT
> bool "Enable support for 16-bit segments" if EXPERT
> default y
> + depends on MODIFY_LDT_SYSCALL
> ---help---
> This option is required by programs like Wine to run 16-bit
> protected mode legacy code on x86 processors. Disabling
> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
> This is used to work around broken boot loaders. This should
> be set to 'N' under normal conditions.
>
> +config MODIFY_LDT_SYSCALL
> + bool "Enable the LDT (local descriptor table)" if EXPERT
> + default y
> + ---help---
> + Linux can allow user programs to install a per-process x86
> + Local Descriptor Table (LDT) using the modify_ldt(2) system
> + call. This is required to run 16-bit or segmented code such as
> + DOSEMU or some Wine programs. It is also used by some very old
> + threading libraries.
> +
> + Enabling this feature adds a small amount of overhead to
> + context switches and increases the low-level kernel attack
> + surface. Disabling it removes the modify_ldt(2) system call.
> +
> + Saying 'N' here may make sense for embedded or server kernels.
> +

I believe Wine still uses the LDT for thread-local data, even for 32
and 64-bit programs. This is separate from the Linux runtime TLS.

--
Brian Gerst

2015-07-21 20:35:00

by Andy Lutomirski

Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
> On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>> The modify_ldt syscall exposes a large attack surface and is
>> unnecessary for modern userspace. Make it optional.
>>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> arch/x86/Kconfig | 17 +++++++++++++++++
>> arch/x86/include/asm/mmu.h | 2 ++
>> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
>> arch/x86/kernel/Makefile | 3 ++-
>> arch/x86/kernel/cpu/perf_event.c | 4 ++++
>> arch/x86/kernel/process_64.c | 2 ++
>> arch/x86/kernel/step.c | 2 ++
>> kernel/sys_ni.c | 1 +
>> 8 files changed, 53 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 55bced17dc95..a7ff3980bd65 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1009,6 +1009,7 @@ config VM86
>> config X86_16BIT
>> bool "Enable support for 16-bit segments" if EXPERT
>> default y
>> + depends on MODIFY_LDT_SYSCALL
>> ---help---
>> This option is required by programs like Wine to run 16-bit
>> protected mode legacy code on x86 processors. Disabling
>> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
>> This is used to work around broken boot loaders. This should
>> be set to 'N' under normal conditions.
>>
>> +config MODIFY_LDT_SYSCALL
>> + bool "Enable the LDT (local descriptor table)" if EXPERT
>> + default y
>> + ---help---
>> + Linux can allow user programs to install a per-process x86
>> + Local Descriptor Table (LDT) using the modify_ldt(2) system
>> + call. This is required to run 16-bit or segmented code such as
>> + DOSEMU or some Wine programs. It is also used by some very old
>> + threading libraries.
>> +
>> + Enabling this feature adds a small amount of overhead to
>> + context switches and increases the low-level kernel attack
>> + surface. Disabling it removes the modify_ldt(2) system call.
>> +
>> + Saying 'N' here may make sense for embedded or server kernels.
>> +
>
> I believe Wine still uses the LDT for thread-local data, even for 32
> and 64-bit programs. This is separate from the Linux runtime TLS.
>

Really? I thought the whole reason we had three set_thread_area slots
was for Wine.

--Andy

2015-07-21 20:54:10

by Brian Gerst

Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 4:34 PM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
>> On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>>> The modify_ldt syscall exposes a large attack surface and is
>>> unnecessary for modern userspace. Make it optional.
>>>
>>> Signed-off-by: Andy Lutomirski <[email protected]>
>>> ---
>>> arch/x86/Kconfig | 17 +++++++++++++++++
>>> arch/x86/include/asm/mmu.h | 2 ++
>>> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
>>> arch/x86/kernel/Makefile | 3 ++-
>>> arch/x86/kernel/cpu/perf_event.c | 4 ++++
>>> arch/x86/kernel/process_64.c | 2 ++
>>> arch/x86/kernel/step.c | 2 ++
>>> kernel/sys_ni.c | 1 +
>>> 8 files changed, 53 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>>> index 55bced17dc95..a7ff3980bd65 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -1009,6 +1009,7 @@ config VM86
>>> config X86_16BIT
>>> bool "Enable support for 16-bit segments" if EXPERT
>>> default y
>>> + depends on MODIFY_LDT_SYSCALL
>>> ---help---
>>> This option is required by programs like Wine to run 16-bit
>>> protected mode legacy code on x86 processors. Disabling
>>> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
>>> This is used to work around broken boot loaders. This should
>>> be set to 'N' under normal conditions.
>>>
>>> +config MODIFY_LDT_SYSCALL
>>> + bool "Enable the LDT (local descriptor table)" if EXPERT
>>> + default y
>>> + ---help---
>>> + Linux can allow user programs to install a per-process x86
>>> + Local Descriptor Table (LDT) using the modify_ldt(2) system
>>> + call. This is required to run 16-bit or segmented code such as
>>> + DOSEMU or some Wine programs. It is also used by some very old
>>> + threading libraries.
>>> +
>>> + Enabling this feature adds a small amount of overhead to
>>> + context switches and increases the low-level kernel attack
>>> + surface. Disabling it removes the modify_ldt(2) system call.
>>> +
>>> + Saying 'N' here may make sense for embedded or server kernels.
>>> +
>>
>> I believe Wine still uses the LDT for thread-local data, even for 32
>> and 64-bit programs. This is separate from the Linux runtime TLS.
>>
>
> Really? I thought the whole reason we had three set_thread_area slots
> was for Wine.

You're right. I wasn't sure if Wine was ever changed to use that.

--
Brian Gerst

2015-07-21 21:55:27

by Boris Ostrovsky

Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
> modify_ldt has questionable locking and does not synchronize
> threads. Improve it: redesign the locking and synchronize all
> threads' LDTs using an IPI on all modifications.
>
> This will dramatically slow down modify_ldt in multithreaded
> programs, but there shouldn't be any multithreaded programs that
> care about modify_ldt's performance in the first place.
>
> Cc: [email protected]
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/include/asm/desc.h | 15 ---
> arch/x86/include/asm/mmu.h | 3 +-
> arch/x86/include/asm/mmu_context.h | 48 ++++++-
> arch/x86/kernel/cpu/common.c | 4 +-
> arch/x86/kernel/cpu/perf_event.c | 12 +-
> arch/x86/kernel/ldt.c | 247 +++++++++++++++++++------------------
> arch/x86/kernel/process_64.c | 4 +-
> arch/x86/kernel/step.c | 6 +-
> arch/x86/power/cpu.c | 3 +-
> 9 files changed, 192 insertions(+), 150 deletions(-)
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index a0bf89fd2647..4e10d73cf018 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -280,21 +280,6 @@ static inline void clear_LDT(void)
> set_ldt(NULL, 0);
> }
>
> -/*
> - * load one particular LDT into the current CPU
> - */
> -static inline void load_LDT_nolock(mm_context_t *pc)
> -{
> - set_ldt(pc->ldt, pc->size);
> -}
> -
> -static inline void load_LDT(mm_context_t *pc)
> -{
> - preempt_disable();
> - load_LDT_nolock(pc);
> - preempt_enable();
> -}
> -
> static inline unsigned long get_desc_base(const struct desc_struct *desc)
> {
> return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 09b9620a73b4..364d27481a52 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -9,8 +9,7 @@
> * we put the segment information here.
> */
> typedef struct {
> - void *ldt;
> - int size;
> + struct ldt_struct *ldt;
>
> #ifdef CONFIG_X86_64
> /* True if mm supports a task running in 32 bit compatibility mode. */
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 5e8daee7c5c9..1ff121fbf366 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct *mm) {}
> #endif
>
> /*
> + * ldt_structs can be allocated, used, and freed, but they are never
> + * modified while live.
> + */
> +struct ldt_struct {
> + int size;
> + int __pad; /* keep the descriptors naturally aligned. */
> + struct desc_struct entries[];
> +};



This breaks Xen, which expects the LDT to be page-aligned. Not sure why.

Jan, Andrew?

-boris
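
A note on why the alignment changes (a guess at the cause, not a
confirmed diagnosis): with the new layout the descriptor array starts
8 bytes past the allocation base, so even a page-aligned allocation no
longer yields a page-aligned LDT. A quick userspace check, with
desc_struct reduced to an 8-byte stand-in:

#include <stdio.h>
#include <stddef.h>

struct desc_struct {
        unsigned int a, b;      /* 8 bytes, like LDT_ENTRY_SIZE */
};

struct ldt_struct {
        int size;
        int __pad;
        struct desc_struct entries[];
};

int main(void)
{
        /* entries[] begins at offset 8, so a page-aligned
         * allocation gives a non-page-aligned LDT base. */
        printf("entries[] offset = %zu\n",
               offsetof(struct ldt_struct, entries));
        return 0;
}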


> @@ -213,34 +219,41 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
> goto out;
> }
>
> - mutex_lock(&mm->context.lock);
> - if (ldt_info.entry_number >= mm->context.size) {
> - error = alloc_ldt(&current->mm->context,
> - ldt_info.entry_number + 1, 1);
> - if (error < 0)
> - goto out_unlock;
> - }
> -
> - /* Allow LDTs to be cleared by the user. */
> - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
> - if (oldmode || LDT_empty(&ldt_info)) {
> - memset(&ldt, 0, sizeof(ldt));
> - goto install;
> + if ((oldmode && ldt_info.base_addr == 0 && ldt_info.limit == 0) ||
> + LDT_empty(&ldt_info)) {
> + /* The user wants to clear the entry. */
> + memset(&ldt, 0, sizeof(ldt));
> + } else {
> + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> + error = -EINVAL;
> + goto out;
> }
> +
> + fill_ldt(&ldt, &ldt_info);
> + if (oldmode)
> + ldt.avl = 0;
> }
>
> - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> - error = -EINVAL;
> + mutex_lock(&mm->context.lock);
> +
> + old_ldt = mm->context.ldt;
> + oldsize = old_ldt ? old_ldt->size : 0;
> + newsize = max((int)(ldt_info.entry_number + 1), oldsize);
> +
> + error = -ENOMEM;
> + new_ldt = alloc_ldt_struct(newsize);
> + if (!new_ldt)
> goto out_unlock;
> - }
>
> - fill_ldt(&ldt, &ldt_info);
> - if (oldmode)
> - ldt.avl = 0;
> + for (i = 0; i < oldsize; i++)
> + write_ldt_entry(new_ldt->entries, i,
> + &old_ldt->entries[i]);
> + for (i = oldsize; i < newsize; i++)
> + write_ldt_entry(new_ldt->entries, i, &blank_desc);
> + write_ldt_entry(new_ldt->entries, ldt_info.entry_number, &ldt);
>
> - /* Install the new entry ... */
> -install:
> - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
> + install_ldt(mm, new_ldt);
> + free_ldt_struct(old_ldt);
> error = 0;
>
> out_unlock:
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 71d7849a07f7..f6b916387590 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all)
> void release_thread(struct task_struct *dead_task)
> {
> if (dead_task->mm) {
> - if (dead_task->mm->context.size) {
> + if (dead_task->mm->context.ldt) {
> pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
> dead_task->comm,
> dead_task->mm->context.ldt,
> - dead_task->mm->context.size);
> + dead_task->mm->context.ldt->size);
> BUG();
> }
> }
> diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
> index 9b4d51d0c0d0..6273324186ac 100644
> --- a/arch/x86/kernel/step.c
> +++ b/arch/x86/kernel/step.c
> @@ -5,6 +5,7 @@
> #include <linux/mm.h>
> #include <linux/ptrace.h>
> #include <asm/desc.h>
> +#include <asm/mmu_context.h>
>
> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
> {
> @@ -30,10 +31,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
> seg &= ~7UL;
>
> mutex_lock(&child->mm->context.lock);
> - if (unlikely((seg >> 3) >= child->mm->context.size))
> + if (unlikely(!child->mm->context.ldt ||
> + (seg >> 3) >= child->mm->context.ldt->size))
> addr = -1L; /* bogus selector, access would fault */
> else {
> - desc = child->mm->context.ldt + seg;
> + desc = &child->mm->context.ldt->entries[seg];
> base = get_desc_base(desc);
>
> /* 16-bit code segment? */
> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 0d7dd1f5ac36..9ab52791fed5 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -22,6 +22,7 @@
> #include <asm/fpu/internal.h>
> #include <asm/debugreg.h>
> #include <asm/cpu.h>
> +#include <asm/mmu_context.h>
>
> #ifdef CONFIG_X86_32
> __visible unsigned long saved_context_ebx;
> @@ -153,7 +154,7 @@ static void fix_processor_context(void)
> syscall_init(); /* This sets MSR_*STAR and related */
> #endif
> load_TR_desc(); /* This does ltr */
> - load_LDT(&current->active_mm->context); /* This does lldt */
> + load_mm_ldt(current->active_mm); /* This does lldt */
>
> fpu__resume_cpu();
> }

2015-07-21 22:04:03

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] selftests/x86, x86/ldt: Add a selftest for modify_ldt

On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
> diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
> new file mode 100644
> index 000000000000..6f6699f0351a
> --- /dev/null
> +++ b/tools/testing/selftests/x86/ldt_gdt.c
> @@ -0,0 +1,492 @@
> +/*
> + * ldt_gdt.c - Test cases for LDT and GDT access
> + * Copyright (c) 2015 Andrew Lutomirski
> + */
> +
> +#define _GNU_SOURCE
> +#include <err.h>
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <signal.h>
> +#include <setjmp.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <xmmintrin.h>

Is xmmintrin.h necessary? It breaks the 32-bit build with

/usr/lib/gcc/x86_64-redhat-linux/4.7.2/include/xmmintrin.h:32:3: error:
#error "SSE instruction set not enabled"

unless I add -msse2.

(This also needs stdlib.h for exit() declaration)
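
For reference, the fix these two points boil down to is roughly the
following change to the test's include block (a sketch, not the actual
follow-up patch):

-#include <xmmintrin.h>
+#include <stdlib.h>	/* for exit() */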

-boris

2015-07-21 22:34:27

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] selftests/x86, x86/ldt: Add a selftest for modify_ldt

On Tue, Jul 21, 2015 at 3:02 PM, Boris Ostrovsky
<[email protected]> wrote:
> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>
>> diff --git a/tools/testing/selftests/x86/ldt_gdt.c
>> b/tools/testing/selftests/x86/ldt_gdt.c
>> new file mode 100644
>> index 000000000000..6f6699f0351a
>> --- /dev/null
>> +++ b/tools/testing/selftests/x86/ldt_gdt.c
>> @@ -0,0 +1,492 @@
>> +/*
>> + * ldt_gdt.c - Test cases for LDT and GDT access
>> + * Copyright (c) 2015 Andrew Lutomirski
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <err.h>
>> +#include <stdio.h>
>> +#include <stdint.h>
>> +#include <signal.h>
>> +#include <setjmp.h>
>> +#include <string.h>
>> +#include <errno.h>
>> +#include <xmmintrin.h>
>
>
> Is xmmintrin.h necessary? It breaks 32-bit build with

no, and I'll remove it.

--Andy

2015-07-21 23:37:47

by Willy Tarreau

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] selftests/x86, x86/ldt: Add a selftest for modify_ldt

Hi Andy,

On Tue, Jul 21, 2015 at 12:59:31PM -0700, Andy Lutomirski wrote:
> This tests general modify_ldt behavior (only writes, so far) as
> well as synchronous updates via IPI. It fails on old kernels.
>
> I called this ldt_gdt because I'll add set_thread_area tests to
> it at some point.

Quick feedback: at two places you have this:

> + } else if (errno == ENOSYS) {
> + printf("[OK]\tmodify_ldt is returned -ENOSYS\n");

=> s/is //

Please add stdlib.h to avoid this warning I'm getting on 32-bit:

ldt_gdt.c:286:4: warning: incompatible implicit declaration of built-in function 'exit' [enabled by default]

And I had to remove the xmmintrin.h include as suggested by Boris as well.


FWIW here's what I'm getting here on 4.1.2 without CONFIG_X86_16BIT
(where nmi_espfix failed); the output is the same for a 32- and a 64-bit
process:

[OK] LDT entry 0 has AR 0x0040FA00 and limit 0x0000000A
[OK] LDT entry 0 has AR 0x00C0FA00 and limit 0x0000AFFF
[OK] LDT entry 1 is invalid
[OK] LDT entry 2 has AR 0x00C0FA00 and limit 0x0000AFFF
[OK] LDT entry 1 is invalid
[OK] LDT entry 2 has AR 0x00C0FA00 and limit 0x0000AFFF
[OK] LDT entry 2 has AR 0x00D0FA00 and limit 0x0000AFFF
[OK] LDT entry 2 has AR 0x00D07A00 and limit 0x0000AFFF
[OK] modify_ldt rejected 16 bit segment
[OK] LDT entry 2 has AR 0x00D07200 and limit 0x0000AFFF
[OK] LDT entry 2 has AR 0x00D07000 and limit 0x0000AFFF
[OK] LDT entry 2 has AR 0x00D07400 and limit 0x0000AFFF
[OK] LDT entry 2 has AR 0x00507600 and limit 0x0000000A
[OK] LDT entry 2 has AR 0x00507E00 and limit 0x0000000A
[OK] LDT entry 2 has AR 0x00507C00 and limit 0x0000000A
[OK] LDT entry 2 has AR 0x00507A00 and limit 0x0000000A
[OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
[OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
[RUN] Test fork
[OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
[OK] LDT entry 1 is invalid
[OK] Child succeeded
[OK] modify_ldt failure 22
[OK] modify_ldt rejected 16 bit segment
[OK] modify_ldt rejected 16 bit segment
[OK] modify_ldt rejected 16 bit segment
[OK] modify_ldt rejected 16 bit segment
[OK] modify_ldt rejected 16 bit segment
[OK] modify_ldt rejected 16 bit segment
[OK] LDT entry 0 is invalid
[OK] LDT entry 0 has AR 0x0040F200 and limit 0x00000000
[OK] LDT entry 0 is invalid
[RUN] Cross-CPU LDT invalidation
[FAIL] 5 of 5 iterations failed

Thanks,
Willy

2015-07-21 23:59:20

by Andrew Cooper

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On 21/07/2015 22:53, Boris Ostrovsky wrote:
> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>> --- a/arch/x86/include/asm/mmu_context.h
>> +++ b/arch/x86/include/asm/mmu_context.h
>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>> *mm) {}
>> #endif
>> /*
>> + * ldt_structs can be allocated, used, and freed, but they are never
>> + * modified while live.
>> + */
>> +struct ldt_struct {
>> + int size;
>> + int __pad; /* keep the descriptors naturally aligned. */
>> + struct desc_struct entries[];
>> +};
>
>
>
> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>
> Jan, Andrew?

PV guests are not permitted to have writeable mappings to the frames
making up the GDT and LDT, so they cannot make unaudited changes to
loadable descriptors. In particular, for a 32bit PV guest, it is only
the segment limit which protects Xen from the ring1 guest kernel.

A lot of this code hasn't been touched in years, and it certainly
predates me. The alignment requirement appears to come from the virtual
region Xen uses to map the guests GDT and LDT. Strict alignment is
required for the GDT so Xen's descriptors starting at 0xe0xx are
correct, but the LDT alignment seems to be a side effect of similar
codepaths.

For an LDT smaller than 8192 entries, I can't see any specific reason
for enforcing alignment, other than "that's the way it has always been".

However, the guest would still have to relinquish write access to all
frames which make up the LDT, which looks to be a bit of an issue given
the snippet above.

I think I have a solution, but I doubt it is going to be very popular.

* Make a new paravirt hook for allocation of ldt_struct, so the paravirt
backend can choose an alignment if needed
* Make absolutely certain that __pad has the value 0 (so size and __pad
combined don't look like a present descriptor)
* Never hand selector 0x0008 to unsuspecting users.

This will allow ldt_struct itself to be page aligned, and for the size
field to sit across the base/limit field of what would logically be
selector 0x0008. There would be some issues accessing size. To load
frames as an LDT, a guest must drop all refs to the page so that its
type may be changed from writeable to segdesc. After that, an
update_descriptor hypercall can be used to change size, and I believe
the guest may subsequently recreate read-only mappings to the frames in
question (although frankly it is getting late so you will want to double
check all of this).

Anyhow, this looks like an issue which should be fixed up with slightly
more PVOps, rather than enforcing a Xen view of the world on native Linux.
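
As a rough shape for that direction, the hooks might look like the
sketch below. The names and signatures here are invented purely for
illustration; no such hooks exist in the tree:

struct desc_struct;

/* Hypothetical ops, sketched only to illustrate the proposal above. */
struct ldt_alloc_ops {
	/*
	 * The backend allocates the descriptor array itself, so the
	 * Xen implementation can impose page alignment while the
	 * native one imposes nothing.
	 */
	struct desc_struct *(*alloc_entries)(unsigned int nr_entries);
	void (*free_entries)(struct desc_struct *entries,
			     unsigned int nr_entries);
};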

~Andrew

2015-07-21 23:40:36

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] selftests/x86, x86/ldt: Add a selftest for modify_ldt

On Tue, Jul 21, 2015 at 4:36 PM, Willy Tarreau <[email protected]> wrote:
> Hi Andy,
>
> On Tue, Jul 21, 2015 at 12:59:31PM -0700, Andy Lutomirski wrote:
>> This tests general modify_ldt behavior (only writes, so far) as
>> well as synchronous updates via IPI. It fails on old kernels.
>>
>> I called this ldt_gdt because I'll add set_thread_area tests to
>> it at some point.
>
> Quick feedback : at two places you have this :
>
>> + } else if (errno == ENOSYS) {
>> + printf("[OK]\tmodify_ldt is returned -ENOSYS\n");
>
> => s/is //
>
> Please add stdlib.h to avoid this warning I'm getting on 32-bit :
>
> ldt_gdt.c:286:4: warning: incompatible implicit declaration of built-in function 'exit' [enabled by default]
>
> And I had to remove xmmintrinsic as suggested by Boris as well.
>
>
> FWIW here's what I'm getting here on 4.1.2 without CONFIG_X86_16BIT
> (where nmi_espfix failed), the output is the same for a 32- and a 64-bit
> process :
>
> [OK] LDT entry 0 has AR 0x0040FA00 and limit 0x0000000A
> [OK] LDT entry 0 has AR 0x00C0FA00 and limit 0x0000AFFF
> [OK] LDT entry 1 is invalid
> [OK] LDT entry 2 has AR 0x00C0FA00 and limit 0x0000AFFF
> [OK] LDT entry 1 is invalid
> [OK] LDT entry 2 has AR 0x00C0FA00 and limit 0x0000AFFF
> [OK] LDT entry 2 has AR 0x00D0FA00 and limit 0x0000AFFF
> [OK] LDT entry 2 has AR 0x00D07A00 and limit 0x0000AFFF
> [OK] modify_ldt rejected 16 bit segment
> [OK] LDT entry 2 has AR 0x00D07200 and limit 0x0000AFFF
> [OK] LDT entry 2 has AR 0x00D07000 and limit 0x0000AFFF
> [OK] LDT entry 2 has AR 0x00D07400 and limit 0x0000AFFF
> [OK] LDT entry 2 has AR 0x00507600 and limit 0x0000000A
> [OK] LDT entry 2 has AR 0x00507E00 and limit 0x0000000A
> [OK] LDT entry 2 has AR 0x00507C00 and limit 0x0000000A
> [OK] LDT entry 2 has AR 0x00507A00 and limit 0x0000000A
> [OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
> [OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
> [RUN] Test fork
> [OK] LDT entry 2 has AR 0x00507800 and limit 0x0000000A
> [OK] LDT entry 1 is invalid
> [OK] Child succeeded
> [OK] modify_ldt failure 22
> [OK] modify_ldt rejected 16 bit segment
> [OK] modify_ldt rejected 16 bit segment
> [OK] modify_ldt rejected 16 bit segment
> [OK] modify_ldt rejected 16 bit segment
> [OK] modify_ldt rejected 16 bit segment
> [OK] modify_ldt rejected 16 bit segment
> [OK] LDT entry 0 is invalid
> [OK] LDT entry 0 has AR 0x0040F200 and limit 0x00000000
> [OK] LDT entry 0 is invalid
> [RUN] Cross-CPU LDT invalidation
> [FAIL] 5 of 5 iterations failed

That's intentional. Old modify_ldt can leave other threads with stale
cached descriptors, which is what you're seeing.
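
For readers following along, the scenario behind that last test is
roughly the sketch below (a compressed illustration, not the selftest
itself; build with -pthread and run on a multi-core machine). On a
kernel with the synchronous IPI, clearing the entry should kill the
process with SIGSEGV almost immediately, because the return from the
IPI re-validates %ss from the interrupt frame; on older kernels the
main thread can spin on indefinitely with a descriptor that is no
longer in the LDT:

#define _GNU_SOURCE
#include <asm/ldt.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* func 1 is the old-style write; base == 0 && limit == 0 clears. */
static int write_entry0(const struct user_desc *d)
{
	return syscall(SYS_modify_ldt, 1, d, sizeof(*d));
}

static void *invalidator(void *arg)
{
	struct user_desc clear;

	sleep(1);
	memset(&clear, 0, sizeof(clear));
	write_entry0(&clear);	/* a patched kernel IPIs the other CPU here */
	return arg;
}

int main(void)
{
	struct user_desc flat = {
		.entry_number	= 0,
		.base_addr	= 0,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,	/* flat data segment */
	};
	unsigned short sel = (0 << 3) | 4 | 3;	/* entry 0, TI=1 (LDT), RPL 3 */
	pthread_t t;

	if (write_entry0(&flat) != 0) {
		perror("modify_ldt");
		return 1;
	}
	pthread_create(&t, NULL, invalidator, NULL);

	/*
	 * Park %ss on the LDT entry; the flat base/limit keeps the
	 * stack usable.  Once invalidator() clears the entry, only a
	 * synchronous kernel makes this loop fault promptly.
	 */
	asm volatile ("mov %0, %%ss" : : "rm" (sel));
	for (;;)
		;
}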

--Andy

>
> Thanks,
> Willy
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-07-22 00:07:33

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
<[email protected]> wrote:
> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>> --- a/arch/x86/include/asm/mmu_context.h
>>> +++ b/arch/x86/include/asm/mmu_context.h
>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>> *mm) {}
>>> #endif
>>> /*
>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>> + * modified while live.
>>> + */
>>> +struct ldt_struct {
>>> + int size;
>>> + int __pad; /* keep the descriptors naturally aligned. */
>>> + struct desc_struct entries[];
>>> +};
>>
>>
>>
>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>
>> Jan, Andrew?
>
> PV guests are not permitted to have writeable mappings to the frames
> making up the GDT and LDT, so it cannot make unaudited changes to
> loadable descriptors. In particular, for a 32bit PV guest, it is only
> the segment limit which protects Xen from the ring1 guest kernel.
>
> A lot of this code hasn't been touched in years, and it certainly
> predates me. The alignment requirement appears to come from the virtual
> region Xen uses to map the guests GDT and LDT. Strict alignment is
> required for the GDT so Xen's descriptors starting at 0xe0xx are
> correct, but the LDT alignment seems to be a side effect of similar
> codepaths.
>
> For an LDT smaller than 8192 entries, I can't see any specific reason
> for enforcing alignment, other than "that's the way it has always been".
>
> However, the guest would still have to relinquish write access to all
> frames which make up the LDT, which looks to be a bit of an issue given
> the snippet above.

Does the LDT itself need to be aligned or just the address passed to
paravirt_alloc_ldt?

>
> I think I have a solution, but I doubt it is going to be very popular.
>
> * Make a new paravirt hook for allocation of ldt_struct, so the paravirt
> backend can choose an alignment if needed
> * Make absolutely certain that __pad has the value 0 (so size and __pad
> combined don't look like a present descriptor)
> * Never hand selector 0x0008 to unsuspecting users.

Yuck.

>
> This will allow ldt_struct itself to be page aligned, and for the size
> field to sit across the base/limit field of what would logically be
> selector 0x0008 There would be some issues accessing size. To load
> frames as an LDT, a guest must drop all refs to the page so that its
> type may be changed from writeable to segdesc. After that, an
> update_descriptor hypercall can be used to change size, and I believe
> the guest may subsequently recreate read-only mappings to the frames in
> question (although frankly it is getting late so you will want to double
> check all of this).
>
> Anyhow, this looks like an issue which should be fixed up with slightly
> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>

I could presumably make the allocation the other way around so the
size is at the end. I could even use two separate allocations if
needed.
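
The two-allocation variant would look roughly like this (a sketch of
the idea, not a posted patch):

/*
 * The bookkeeping stays in a small, always-writable allocation; only
 * the entries array, allocated separately and page-aligned, ever has
 * to be handed to the hypervisor with funny page permissions.
 */
struct ldt_struct {
	struct desc_struct *entries;
	int size;
};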

--Andy

2015-07-22 00:21:57

by Andrew Cooper

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On 22/07/2015 01:07, Andy Lutomirski wrote:
> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
> <[email protected]> wrote:
>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>> *mm) {}
>>>> #endif
>>>> /*
>>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>>> + * modified while live.
>>>> + */
>>>> +struct ldt_struct {
>>>> + int size;
>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>> + struct desc_struct entries[];
>>>> +};
>>>
>>>
>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>
>>> Jan, Andrew?
>> PV guests are not permitted to have writeable mappings to the frames
>> making up the GDT and LDT, so it cannot make unaudited changes to
>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>> the segment limit which protects Xen from the ring1 guest kernel.
>>
>> A lot of this code hasn't been touched in years, and it certainly
>> predates me. The alignment requirement appears to come from the virtual
>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>> correct, but the LDT alignment seems to be a side effect of similar
>> codepaths.
>>
>> For an LDT smaller than 8192 entries, I can't see any specific reason
>> for enforcing alignment, other than "that's the way it has always been".
>>
>> However, the guest would still have to relinquish write access to all
>> frames which make up the LDT, which looks to be a bit of an issue given
>> the snippet above.
> Does the LDT itself need to be aligned or just the address passed to
> paravirt_alloc_ldt?

The address which Xen receives needs to be aligned.

It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
it is passed is page aligned, and passes it straight through.

>
>> I think I have a solution, but I doubt it is going to be very popular.
>>
>> * Make a new paravirt hook for allocation of ldt_struct, so the paravirt
>> backend can choose an alignment if needed
>> * Make absolutely certain that __pad has the value 0 (so size and __pad
>> combined don't look like a present descriptor)
>> * Never hand selector 0x0008 to unsuspecting users.
> Yuck.

I actually meant 0x0004, but yes. Very much yuck.

>
>> This will allow ldt_struct itself to be page aligned, and for the size
>> field to sit across the base/limit field of what would logically be
>> selector 0x0008 There would be some issues accessing size. To load
>> frames as an LDT, a guest must drop all refs to the page so that its
>> type may be changed from writeable to segdesc. After that, an
>> update_descriptor hypercall can be used to change size, and I believe
>> the guest may subsequently recreate read-only mappings to the frames in
>> question (although frankly it is getting late so you will want to double
>> check all of this).
>>
>> Anyhow, this looks like an issue which should be fixed up with slightly
>> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>>
> I could presumably make the allocation the other way around so the
> size is at the end. I could even use two separate allocations if
> needed.

I suspect two separate allocations would be the better solution, as it
means that the size field doesn't need to be subject to funny page
permissions.

~Andrew

2015-07-22 00:28:25

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 5:21 PM, Andrew Cooper
<[email protected]> wrote:
> On 22/07/2015 01:07, Andy Lutomirski wrote:
>> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
>> <[email protected]> wrote:
>>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>>> *mm) {}
>>>>> #endif
>>>>> /*
>>>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>>>> + * modified while live.
>>>>> + */
>>>>> +struct ldt_struct {
>>>>> + int size;
>>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>>> + struct desc_struct entries[];
>>>>> +};
>>>>
>>>>
>>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>>
>>>> Jan, Andrew?
>>> PV guests are not permitted to have writeable mappings to the frames
>>> making up the GDT and LDT, so it cannot make unaudited changes to
>>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>>> the segment limit which protects Xen from the ring1 guest kernel.
>>>
>>> A lot of this code hasn't been touched in years, and it certainly
>>> predates me. The alignment requirement appears to come from the virtual
>>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>>> correct, but the LDT alignment seems to be a side effect of similar
>>> codepaths.
>>>
>>> For an LDT smaller than 8192 entries, I can't see any specific reason
>>> for enforcing alignment, other than "that's the way it has always been".
>>>
>>> However, the guest would still have to relinquish write access to all
>>> frames which make up the LDT, which looks to be a bit of an issue given
>>> the snippet above.
>> Does the LDT itself need to be aligned or just the address passed to
>> paravirt_alloc_ldt?
>
> The address which Xen receives needs to be aligned.
>
> It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
> it is passed is page aligned, and passes it straight through.

xen_alloc_ldt is just fiddling with protection though, I think. Isn't
it xen_set_ldt that's the meat? We could easily pass xen_alloc_ldt a
pointer to the ldt_struct.

>
>>
>>> I think I have a solution, but I doubt it is going to be very popular.
>>>
>>> * Make a new paravirt hook for allocation of ldt_struct, so the paravirt
>>> backend can choose an alignment if needed
>>> * Make absolutely certain that __pad has the value 0 (so size and __pad
>>> combined don't look like a present descriptor)
>>> * Never hand selector 0x0008 to unsuspecting users.
>> Yuck.
>
> I actually meant 0x0004, but yes. Very much yuck.
>
>>
>>> This will allow ldt_struct itself to be page aligned, and for the size
>>> field to sit across the base/limit field of what would logically be
>>> selector 0x0008 There would be some issues accessing size. To load
>>> frames as an LDT, a guest must drop all refs to the page so that its
>>> type may be changed from writeable to segdesc. After that, an
>>> update_descriptor hypercall can be used to change size, and I believe
>>> the guest may subsequently recreate read-only mappings to the frames in
>>> question (although frankly it is getting late so you will want to double
>>> check all of this).
>>>
>>> Anyhow, this looks like an issue which should be fixed up with slightly
>>> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>>>
>> I could presumably make the allocation the other way around so the
>> size is at the end. I could even use two separate allocations if
>> needed.
>
> I suspect two separate allocations would be the better solution, as it
> means that the size field doesn't need to be subject to funny page
> permissions.

True. OTOH we never write to the size field after allocating the thing.

--Andy

2015-07-22 00:49:58

by Andrew Cooper

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On 22/07/2015 01:28, Andy Lutomirski wrote:
> On Tue, Jul 21, 2015 at 5:21 PM, Andrew Cooper
> <[email protected]> wrote:
>> On 22/07/2015 01:07, Andy Lutomirski wrote:
>>> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
>>> <[email protected]> wrote:
>>>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>>>> *mm) {}
>>>>>> #endif
>>>>>> /*
>>>>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>>>>> + * modified while live.
>>>>>> + */
>>>>>> +struct ldt_struct {
>>>>>> + int size;
>>>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>>>> + struct desc_struct entries[];
>>>>>> +};
>>>>>
>>>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>>>
>>>>> Jan, Andrew?
>>>> PV guests are not permitted to have writeable mappings to the frames
>>>> making up the GDT and LDT, so it cannot make unaudited changes to
>>>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>>>> the segment limit which protects Xen from the ring1 guest kernel.
>>>>
>>>> A lot of this code hasn't been touched in years, and it certainly
>>>> predates me. The alignment requirement appears to come from the virtual
>>>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>>>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>>>> correct, but the LDT alignment seems to be a side effect of similar
>>>> codepaths.
>>>>
>>>> For an LDT smaller than 8192 entries, I can't see any specific reason
>>>> for enforcing alignment, other than "that's the way it has always been".
>>>>
>>>> However, the guest would still have to relinquish write access to all
>>>> frames which make up the LDT, which looks to be a bit of an issue given
>>>> the snippet above.
>>> Does the LDT itself need to be aligned or just the address passed to
>>> paravirt_alloc_ldt?
>> The address which Xen receives needs to be aligned.
>>
>> It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
>> it is passed is page aligned, and passes it straight through.
> xen_alloc_ldt is just fiddling with protection though, I think. Isn't
> it xen_set_ldt that's the meat? We could easily pass xen_alloc_ldt a
> pointer to the ldt_struct.

So it is. It is the linear_addr in xen_set_ldt() which Xen currently
audits to be page aligned.

>>>> This will allow ldt_struct itself to be page aligned, and for the size
>>>> field to sit across the base/limit field of what would logically be
>>>> selector 0x0008 There would be some issues accessing size. To load
>>>> frames as an LDT, a guest must drop all refs to the page so that its
>>>> type may be changed from writeable to segdesc. After that, an
>>>> update_descriptor hypercall can be used to change size, and I believe
>>>> the guest may subsequently recreate read-only mappings to the frames in
>>>> question (although frankly it is getting late so you will want to double
>>>> check all of this).
>>>>
>>>> Anyhow, this looks like an issue which should be fixed up with slightly
>>>> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>>>>
>>> I could presumably make the allocation the other way around so the
>>> size is at the end. I could even use two separate allocations if
>>> needed.
>> I suspect two separate allocations would be the better solution, as it
>> means that the size field doesn't need to be subject to funny page
>> permissions.
> True. OTOH we never write to the size field after allocating the thing.

Right, but even reading it is going to cause problems if one of the
paravirt ops can't re-establish ro mappings.

~Andrew

2015-07-22 01:06:43

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 5:49 PM, Andrew Cooper
<[email protected]> wrote:
> On 22/07/2015 01:28, Andy Lutomirski wrote:
>> On Tue, Jul 21, 2015 at 5:21 PM, Andrew Cooper
>> <[email protected]> wrote:
>>> On 22/07/2015 01:07, Andy Lutomirski wrote:
>>>> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
>>>> <[email protected]> wrote:
>>>>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>>>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>>>>> *mm) {}
>>>>>>> #endif
>>>>>>> /*
>>>>>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>>>>>> + * modified while live.
>>>>>>> + */
>>>>>>> +struct ldt_struct {
>>>>>>> + int size;
>>>>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>>>>> + struct desc_struct entries[];
>>>>>>> +};
>>>>>>
>>>>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>>>>
>>>>>> Jan, Andrew?
>>>>> PV guests are not permitted to have writeable mappings to the frames
>>>>> making up the GDT and LDT, so it cannot make unaudited changes to
>>>>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>>>>> the segment limit which protects Xen from the ring1 guest kernel.
>>>>>
>>>>> A lot of this code hasn't been touched in years, and it certainly
>>>>> predates me. The alignment requirement appears to come from the virtual
>>>>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>>>>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>>>>> correct, but the LDT alignment seems to be a side effect of similar
>>>>> codepaths.
>>>>>
>>>>> For an LDT smaller than 8192 entries, I can't see any specific reason
>>>>> for enforcing alignment, other than "that's the way it has always been".
>>>>>
>>>>> However, the guest would still have to relinquish write access to all
>>>>> frames which make up the LDT, which looks to be a bit of an issue given
>>>>> the snippet above.
>>>> Does the LDT itself need to be aligned or just the address passed to
>>>> paravirt_alloc_ldt?
>>> The address which Xen receives needs to be aligned.
>>>
>>> It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
>>> it is passed is page aligned, and passes it straight through.
>> xen_alloc_ldt is just fiddling with protection though, I think. Isn't
>> it xen_set_ldt that's the meat? We could easily pass xen_alloc_ldt a
>> pointer to the ldt_struct.
>
> So it is. It is the linear_addr in xen_set_ldt() which Xen currently
> audits to be page aligned.
>
>>>>> This will allow ldt_struct itself to be page aligned, and for the size
>>>>> field to sit across the base/limit field of what would logically be
>>>>> selector 0x0008 There would be some issues accessing size. To load
>>>>> frames as an LDT, a guest must drop all refs to the page so that its
>>>>> type may be changed from writeable to segdesc. After that, an
>>>>> update_descriptor hypercall can be used to change size, and I believe
>>>>> the guest may subsequently recreate read-only mappings to the frames in
>>>>> question (although frankly it is getting late so you will want to double
>>>>> check all of this).
>>>>>
>>>>> Anyhow, this looks like an issue which should be fixed up with slightly
>>>>> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>>>>>
>>>> I could presumably make the allocation the other way around so the
>>>> size is at the end. I could even use two separate allocations if
>>>> needed.
>>> I suspect two separate allocations would be the better solution, as it
>>> means that the size field doesn't need to be subject to funny page
>>> permissions.
>> True. OTOH we never write to the size field after allocating the thing.
>
> Right, but even reading it is going to cause problems if one of the
> paravirt ops can't re-establish ro mappings.

Does paravirt_alloc_ldt completely deny access or does it just set it RO?

--Andy

>
> ~Andrew



--
Andy Lutomirski
AMA Capital Management, LLC

2015-07-22 02:01:09

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
> modify_ldt has questionable locking and does not synchronize
> threads. Improve it: redesign the locking and synchronize all
> threads' LDTs using an IPI on all modifications.

What does this fix? I can see sending an IPI if the LDT is
reallocated, but on every update seems unnecessary.

--
Brian Gerst

2015-07-22 02:05:44

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [Xen-devel] [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous



On 07/21/2015 08:49 PM, Andrew Cooper wrote:
> On 22/07/2015 01:28, Andy Lutomirski wrote:
>> On Tue, Jul 21, 2015 at 5:21 PM, Andrew Cooper
>> <[email protected]> wrote:
>>> On 22/07/2015 01:07, Andy Lutomirski wrote:
>>>> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
>>>> <[email protected]> wrote:
>>>>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>>>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>>>>> *mm) {}
>>>>>>> #endif
>>>>>>> /*
>>>>>>> + * ldt_structs can be allocated, used, and freed, but they are never
>>>>>>> + * modified while live.
>>>>>>> + */
>>>>>>> +struct ldt_struct {
>>>>>>> + int size;
>>>>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>>>>> + struct desc_struct entries[];
>>>>>>> +};
>>>>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>>>>
>>>>>> Jan, Andrew?
>>>>> PV guests are not permitted to have writeable mappings to the frames
>>>>> making up the GDT and LDT, so it cannot make unaudited changes to
>>>>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>>>>> the segment limit which protects Xen from the ring1 guest kernel.
>>>>>
>>>>> A lot of this code hasn't been touched in years, and it certainly
>>>>> predates me. The alignment requirement appears to come from the virtual
>>>>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>>>>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>>>>> correct, but the LDT alignment seems to be a side effect of similar
>>>>> codepaths.
>>>>>
>>>>> For an LDT smaller than 8192 entries, I can't see any specific reason
>>>>> for enforcing alignment, other than "that's the way it has always been".
>>>>>
>>>>> However, the guest would still have to relinquish write access to all
>>>>> frames which make up the LDT, which looks to be a bit of an issue given
>>>>> the snippet above.
>>>> Does the LDT itself need to be aligned or just the address passed to
>>>> paravirt_alloc_ldt?
>>> The address which Xen receives needs to be aligned.
>>>
>>> It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
>>> it is passed is page aligned, and passes it straight through.
>> xen_alloc_ldt is just fiddling with protection though, I think. Isn't
>> it xen_set_ldt that's the meat? We could easily pass xen_alloc_ldt a
>> pointer to the ldt_struct.
> So it is. It is the linear_addr in xen_set_ldt() which Xen currently
> audits to be page aligned.
>
>>>>> This will allow ldt_struct itself to be page aligned, and for the size
>>>>> field to sit across the base/limit field of what would logically be
>>>>> selector 0x0008 There would be some issues accessing size. To load
>>>>> frames as an LDT, a guest must drop all refs to the page so that its
>>>>> type may be changed from writeable to segdesc. After that, an
>>>>> update_descriptor hypercall can be used to change size, and I believe
>>>>> the guest may subsequently recreate read-only mappings to the frames in
>>>>> question (although frankly it is getting late so you will want to double
>>>>> check all of this).
>>>>>
>>>>> Anyhow, this looks like an issue which should be fixed up with slightly
>>>>> more PVOps, rather than enforcing a Xen view of the world on native Linux.
>>>>>
>>>> I could presumably make the allocation the other way around so the
>>>> size is at the end. I could even use two separate allocations if
>>>> needed.

Why not wrap mm_context_t's ldt and size into a struct (just like
ldt_struct but without __pad) and have a single allocation of ldt?

I.e.

struct ldt_struct {
	int size;
	struct desc_struct *entries;
};

--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,8 +9,7 @@
* we put the segment information here.
*/
typedef struct {
- void *ldt;
- int size;
+ struct ldt_struct ldt;
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */


-boris

>>> I suspect two separate allocations would be the better solution, as it
>>> means that the size field doesn't need to be subject to funny page
>>> permissions.
>> True. OTOH we never write to the size field after allocating the thing.
> Right, but even reading it is going to cause problems if one of the
> paravirt ops can't re-establish ro mappings.
>
> ~Andrew
>
> _______________________________________________
> Xen-devel mailing list
> [email protected]
> http://lists.xen.org/xen-devel

2015-07-22 02:13:03

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 7:01 PM, Brian Gerst <[email protected]> wrote:
> On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>> modify_ldt has questionable locking and does not synchronize
>> threads. Improve it: redesign the locking and synchronize all
>> threads' LDTs using an IPI on all modifications.
>
> What does this fix? I can see sending an IPI if the LDT is
> reallocated, but on every update seems unnecessary.
>

It prevents nastiness in which you're in user mode with an impossible
CS or SS, resulting in potentially interesting artifacts in
interrupts, NMIs, etc.

> --
> Brian Gerst



--
Andy Lutomirski
AMA Capital Management, LLC

2015-07-22 02:13:50

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [Xen-devel] [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 7:04 PM, Boris Ostrovsky
<[email protected]> wrote:
>
>
> On 07/21/2015 08:49 PM, Andrew Cooper wrote:
>>
>> On 22/07/2015 01:28, Andy Lutomirski wrote:
>>>
>>> On Tue, Jul 21, 2015 at 5:21 PM, Andrew Cooper
>>> <[email protected]> wrote:
>>>>
>>>> On 22/07/2015 01:07, Andy Lutomirski wrote:
>>>>>
>>>>> On Tue, Jul 21, 2015 at 4:38 PM, Andrew Cooper
>>>>> <[email protected]> wrote:
>>>>>>
>>>>>> On 21/07/2015 22:53, Boris Ostrovsky wrote:
>>>>>>>
>>>>>>> On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
>>>>>>>>
>>>>>>>> --- a/arch/x86/include/asm/mmu_context.h
>>>>>>>> +++ b/arch/x86/include/asm/mmu_context.h
>>>>>>>> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct
>>>>>>>> *mm) {}
>>>>>>>> #endif
>>>>>>>> /*
>>>>>>>> + * ldt_structs can be allocated, used, and freed, but they are
>>>>>>>> never
>>>>>>>> + * modified while live.
>>>>>>>> + */
>>>>>>>> +struct ldt_struct {
>>>>>>>> + int size;
>>>>>>>> + int __pad; /* keep the descriptors naturally aligned. */
>>>>>>>> + struct desc_struct entries[];
>>>>>>>> +};
>>>>>>>
>>>>>>> This breaks Xen which expects LDT to be page-aligned. Not sure why.
>>>>>>>
>>>>>>> Jan, Andrew?
>>>>>>
>>>>>> PV guests are not permitted to have writeable mappings to the frames
>>>>>> making up the GDT and LDT, so it cannot make unaudited changes to
>>>>>> loadable descriptors. In particular, for a 32bit PV guest, it is only
>>>>>> the segment limit which protects Xen from the ring1 guest kernel.
>>>>>>
>>>>>> A lot of this code hasn't been touched in years, and it certainly
>>>>>> predates me. The alignment requirement appears to come from the
>>>>>> virtual
>>>>>> region Xen uses to map the guests GDT and LDT. Strict alignment is
>>>>>> required for the GDT so Xen's descriptors starting at 0xe0xx are
>>>>>> correct, but the LDT alignment seems to be a side effect of similar
>>>>>> codepaths.
>>>>>>
>>>>>> For an LDT smaller than 8192 entries, I can't see any specific reason
>>>>>> for enforcing alignment, other than "that's the way it has always
>>>>>> been".
>>>>>>
>>>>>> However, the guest would still have to relinquish write access to all
>>>>>> frames which make up the LDT, which looks to be a bit of an issue
>>>>>> given
>>>>>> the snippet above.
>>>>>
>>>>> Does the LDT itself need to be aligned or just the address passed to
>>>>> paravirt_alloc_ldt?
>>>>
>>>> The address which Xen receives needs to be aligned.
>>>>
>>>> It looks like xen_alloc_ldt() blindly assumes that the desc_struct *ldt
>>>> it is passed is page aligned, and passes it straight through.
>>>
>>> xen_alloc_ldt is just fiddling with protection though, I think. Isn't
>>> it xen_set_ldt that's the meat? We could easily pass xen_alloc_ldt a
>>> pointer to the ldt_struct.
>>
>> So it is. It is the linear_addr in xen_set_ldt() which Xen currently
>> audits to be page aligned.
>>
>>>>>> This will allow ldt_struct itself to be page aligned, and for the size
>>>>>> field to sit across the base/limit field of what would logically be
>>>>>> selector 0x0008 There would be some issues accessing size. To load
>>>>>> frames as an LDT, a guest must drop all refs to the page so that its
>>>>>> type may be changed from writeable to segdesc. After that, an
>>>>>> update_descriptor hypercall can be used to change size, and I believe
>>>>>> the guest may subsequently recreate read-only mappings to the frames
>>>>>> in
>>>>>> question (although frankly it is getting late so you will want to
>>>>>> double
>>>>>> check all of this).
>>>>>>
>>>>>> Anyhow, this looks like an issue which should be fixed up with
>>>>>> slightly
>>>>>> more PVOps, rather than enforcing a Xen view of the world on native
>>>>>> Linux.
>>>>>>
>>>>> I could presumably make the allocation the other way around so the
>>>>> size is at the end. I could even use two separate allocations if
>>>>> needed.
>
>
> Why not wrap mm_context_t's ldt and size into a struct (just like ldt_struct
> but without __pad) and have a single allocation of ldt?
>
> I.e.
>
> struct ldt_struct {
> int size;
> struct desc_struct *entries;
> }
>
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -9,8 +9,7 @@
> * we put the segment information here.
> */
> typedef struct {
> - void *ldt;
> - int size;
> + struct ldt_struct ldt;
> #ifdef CONFIG_X86_64
> /* True if mm supports a task running in 32 bit compatibility mode. */

I want the atomic read of both of them. The current code makes
interesting assumptions about ordering that may or may not be correct
but are certainly not obviously correct.
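
For reference, the read side that motivates this pairs with the
smp_store_release() in install_ldt(); reconstructed from the hunks
quoted earlier in the thread, it looks roughly like this (a sketch,
not the byte-exact patch):

static inline void load_mm_ldt(struct mm_struct *mm)
{
	struct ldt_struct *ldt;

	/* lockless_dereference synchronizes with smp_store_release. */
	ldt = lockless_dereference(mm->context.ldt);

	/*
	 * One pointer read hands back size and entries together.  Any
	 * change to mm->context.ldt is followed by an IPI to every CPU
	 * using the mm, so the old ldt_struct is not freed while a CPU
	 * can still be loading from it.
	 */
	if (unlikely(ldt))
		set_ldt(ldt->entries, ldt->size);
	else
		clear_LDT();
}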

--Andy

2015-07-22 02:53:28

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 10:12 PM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Jul 21, 2015 at 7:01 PM, Brian Gerst <[email protected]> wrote:
>> On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>>> modify_ldt has questionable locking and does not synchronize
>>> threads. Improve it: redesign the locking and synchronize all
>>> threads' LDTs using an IPI on all modifications.
>>
>> What does this fix? I can see sending an IPI if the LDT is
>> reallocated, but on every update seems unnecessary.
>>
>
> It prevents nastiness in which you're in user mode with an impossible
> CS or SS, resulting in potentially interesting artifacts in
> interrupts, NMIs, etc.

By impossible, do you mean a partially updated descriptor when the
interrupt occurs? Would making sure that the descriptor is atomically
updated (using set_64bit()) fix that?
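
Concretely, that suggestion amounts to something like the line below
(a sketch only; whether a single atomic store suffices is exactly the
open question):

	/*
	 * Publish the descriptor with one 64-bit store (cmpxchg8b-backed
	 * on 32-bit) instead of two 32-bit writes.  "entry" and
	 * "new_desc" are placeholder names for this sketch.
	 */
	set_64bit((u64 *)&ldt[entry], *(const u64 *)&new_desc);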

--
Brian Gerst

2015-07-22 04:22:54

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous

On Tue, Jul 21, 2015 at 7:53 PM, Brian Gerst <[email protected]> wrote:
> On Tue, Jul 21, 2015 at 10:12 PM, Andy Lutomirski <[email protected]> wrote:
>> On Tue, Jul 21, 2015 at 7:01 PM, Brian Gerst <[email protected]> wrote:
>>> On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>>>> modify_ldt has questionable locking and does not synchronize
>>>> threads. Improve it: redesign the locking and synchronize all
>>>> threads' LDTs using an IPI on all modifications.
>>>
>>> What does this fix? I can see sending an IPI if the LDT is
>>> reallocated, but on every update seems unnecessary.
>>>
>>
>> It prevents nastiness in which you're in user mode with an impossible
>> CS or SS, resulting in potentially interesting artifacts in
>> interrupts, NMIs, etc.
>
> By impossible, do you mean a partially updated descriptor when the
> interrupt occurs? Would making sure that the descriptor is atomically
> updated (using set_64bit()) fix that?
>

I tried to exploit that once and didn't get very far. If I could
cause the LDT to be populated one bit at a time, I think I could
materialize a call gate out of thin air. The docs are unclear on
what, if anything, the memory ordering rules are.

I'm more concerned about the case where a segment register caches a
value that is no longer in the LDT. If it's DS, ES, FS, or GS, it
results in nondeterministic behavior but is otherwise not a big deal.
If it's CS or SS, then an interrupt or exception will write a stack
frame with the selectors but IRET can fault.

If the interrupt is an NMI and certain other conditions are met and
your kernel is older than 4.2-rc3, then you should read the
CVE-2015-5157 advisory I'll send tomorrow :) Even on 4.2-rc3, the NMI
code still struggles a bit if this happens.

With this patch, you can never be in user mode with CS or SS selectors
that point at the LDT but don't match the LDT. That makes me a lot
more comfortable with modify_ldt.

--Andy

2015-07-22 06:06:49

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional


* Andy Lutomirski <[email protected]> wrote:

> On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
> > On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
> >>
> >> The modify_ldt syscall exposes a large attack surface and is unnecessary for
> >> modern userspace. Make it optional.
> >>
> >> Signed-off-by: Andy Lutomirski <[email protected]>
> >> ---
> >> arch/x86/Kconfig | 17 +++++++++++++++++
> >> arch/x86/include/asm/mmu.h | 2 ++
> >> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
> >> arch/x86/kernel/Makefile | 3 ++-
> >> arch/x86/kernel/cpu/perf_event.c | 4 ++++
> >> arch/x86/kernel/process_64.c | 2 ++
> >> arch/x86/kernel/step.c | 2 ++
> >> kernel/sys_ni.c | 1 +
> >> 8 files changed, 53 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> >> index 55bced17dc95..a7ff3980bd65 100644
> >> --- a/arch/x86/Kconfig
> >> +++ b/arch/x86/Kconfig
> >> @@ -1009,6 +1009,7 @@ config VM86
> >> config X86_16BIT
> >> bool "Enable support for 16-bit segments" if EXPERT
> >> default y
> >> + depends on MODIFY_LDT_SYSCALL
> >> ---help---
> >> This option is required by programs like Wine to run 16-bit
> >> protected mode legacy code on x86 processors. Disabling
> >> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
> >> This is used to work around broken boot loaders. This should
> >> be set to 'N' under normal conditions.
> >>
> >> +config MODIFY_LDT_SYSCALL
> >> + bool "Enable the LDT (local descriptor table)" if EXPERT
> >> + default y
> >> + ---help---
> >> + Linux can allow user programs to install a per-process x86
> >> + Local Descriptor Table (LDT) using the modify_ldt(2) system
> >> + call. This is required to run 16-bit or segmented code such as
> >> + DOSEMU or some Wine programs. It is also used by some very old
> >> + threading libraries.
> >> +
> >> + Enabling this feature adds a small amount of overhead to
> >> + context switches and increases the low-level kernel attack
> >> + surface. Disabling it removes the modify_ldt(2) system call.
> >> +
> >> + Saying 'N' here may make sense for embedded or server kernels.
> >> +
> >
> > I believe Wine still uses the LDT for thread-local data, even for 32
> > and 64-bit programs. This is separate from the Linux runtime TLS.
> >
>
> Really? I thought the whole reason we had three set_thread_area slots
> was for Wine.

Too bad we have to guess, if only we had the Wine source code under a nicely
accessible Git archive or so to check?

git clone git://source.winehq.org/git/wine.git

;-)

Thanks,

Ingo

2015-07-22 06:23:24

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 11:06 PM, Ingo Molnar <[email protected]> wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
>> On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
>> > On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>> >>
>> >> The modify_ldt syscall exposes a large attack surface and is unnecessary for
>> >> modern userspace. Make it optional.
>> >>
>> >> Signed-off-by: Andy Lutomirski <[email protected]>
>> >> ---
>> >> arch/x86/Kconfig | 17 +++++++++++++++++
>> >> arch/x86/include/asm/mmu.h | 2 ++
>> >> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
>> >> arch/x86/kernel/Makefile | 3 ++-
>> >> arch/x86/kernel/cpu/perf_event.c | 4 ++++
>> >> arch/x86/kernel/process_64.c | 2 ++
>> >> arch/x86/kernel/step.c | 2 ++
>> >> kernel/sys_ni.c | 1 +
>> >> 8 files changed, 53 insertions(+), 9 deletions(-)
>> >>
>> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> >> index 55bced17dc95..a7ff3980bd65 100644
>> >> --- a/arch/x86/Kconfig
>> >> +++ b/arch/x86/Kconfig
>> >> @@ -1009,6 +1009,7 @@ config VM86
>> >> config X86_16BIT
>> >> bool "Enable support for 16-bit segments" if EXPERT
>> >> default y
>> >> + depends on MODIFY_LDT_SYSCALL
>> >> ---help---
>> >> This option is required by programs like Wine to run 16-bit
>> >> protected mode legacy code on x86 processors. Disabling
>> >> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
>> >> This is used to work around broken boot loaders. This should
>> >> be set to 'N' under normal conditions.
>> >>
>> >> +config MODIFY_LDT_SYSCALL
>> >> + bool "Enable the LDT (local descriptor table)" if EXPERT
>> >> + default y
>> >> + ---help---
>> >> + Linux can allow user programs to install a per-process x86
>> >> + Local Descriptor Table (LDT) using the modify_ldt(2) system
>> >> + call. This is required to run 16-bit or segmented code such as
>> >> + DOSEMU or some Wine programs. It is also used by some very old
>> >> + threading libraries.
>> >> +
>> >> + Enabling this feature adds a small amount of overhead to
>> >> + context switches and increases the low-level kernel attack
>> >> + surface. Disabling it removes the modify_ldt(2) system call.
>> >> +
>> >> + Saying 'N' here may make sense for embedded or server kernels.
>> >> +
>> >
>> > I believe Wine still uses the LDT for thread-local data, even for 32
>> > and 64-bit programs. This is separate from the Linux runtime TLS.
>> >
>>
>> Really? I thought the whole reason we had three set_thread_area slots
>> was for Wine.
>
> Too bad we have to guess, if only we had the Wine source code under a nicely
> accessible Git archive or so to check?
>
> git clone git://source.winehq.org/git/wine.git
>
> ;-)

You don't say?

It appears that Wine uses set_thread_area with a fallback to
modify_ldt for 32-bit binaries and arch_prctl for 64-bit.
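
The pattern in question looks roughly like the helper below. This is
not Wine's code; the function name is invented for the sketch:

#include <asm/ldt.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Try a GDT TLS slot first, fall back to the LDT (32-bit case). */
static int install_tls(struct user_desc *d)
{
	d->entry_number = -1;	/* let the kernel pick a free TLS slot */
	if (syscall(SYS_set_thread_area, d) == 0)
		return (d->entry_number << 3) | 3;	/* GDT selector */

	d->entry_number = 0;	/* fall back to LDT entry 0 */
	if (syscall(SYS_modify_ldt, 0x11, d, sizeof(*d)) == 0)
		return (0 << 3) | 4 | 3;		/* LDT selector */

	return -1;
}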

--Andy

2015-07-22 06:27:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional


* Andy Lutomirski <[email protected]> wrote:

> On Tue, Jul 21, 2015 at 11:06 PM, Ingo Molnar <[email protected]> wrote:
> >
> > * Andy Lutomirski <[email protected]> wrote:
> >
> >> On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
> >> > On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
> >> >>
> >> >> The modify_ldt syscall exposes a large attack surface and is unnecessary for
> >> >> modern userspace. Make it optional.
> >> >>
> >> >> Signed-off-by: Andy Lutomirski <[email protected]>
> >> >> ---
> >> >> arch/x86/Kconfig | 17 +++++++++++++++++
> >> >> arch/x86/include/asm/mmu.h | 2 ++
> >> >> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
> >> >> arch/x86/kernel/Makefile | 3 ++-
> >> >> arch/x86/kernel/cpu/perf_event.c | 4 ++++
> >> >> arch/x86/kernel/process_64.c | 2 ++
> >> >> arch/x86/kernel/step.c | 2 ++
> >> >> kernel/sys_ni.c | 1 +
> >> >> 8 files changed, 53 insertions(+), 9 deletions(-)
> >> >>
> >> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> >> >> index 55bced17dc95..a7ff3980bd65 100644
> >> >> --- a/arch/x86/Kconfig
> >> >> +++ b/arch/x86/Kconfig
> >> >> @@ -1009,6 +1009,7 @@ config VM86
> >> >> config X86_16BIT
> >> >> bool "Enable support for 16-bit segments" if EXPERT
> >> >> default y
> >> >> + depends on MODIFY_LDT_SYSCALL
> >> >> ---help---
> >> >> This option is required by programs like Wine to run 16-bit
> >> >> protected mode legacy code on x86 processors. Disabling
> >> >> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
> >> >> This is used to work around broken boot loaders. This should
> >> >> be set to 'N' under normal conditions.
> >> >>
> >> >> +config MODIFY_LDT_SYSCALL
> >> >> + bool "Enable the LDT (local descriptor table)" if EXPERT
> >> >> + default y
> >> >> + ---help---
> >> >> + Linux can allow user programs to install a per-process x86
> >> >> + Local Descriptor Table (LDT) using the modify_ldt(2) system
> >> >> + call. This is required to run 16-bit or segmented code such as
> >> >> + DOSEMU or some Wine programs. It is also used by some very old
> >> >> + threading libraries.
> >> >> +
> >> >> + Enabling this feature adds a small amount of overhead to
> >> >> + context switches and increases the low-level kernel attack
> >> >> + surface. Disabling it removes the modify_ldt(2) system call.
> >> >> +
> >> >> + Saying 'N' here may make sense for embedded or server kernels.
> >> >> +
> >> >
> >> > I believe Wine still uses the LDT for thread-local data, even for 32
> >> > and 64-bit programs. This is separate from the Linux runtime TLS.
> >> >
> >>
> >> Really? I thought the whole reason we had three set_thread_area slots
> >> was for Wine.
> >
> > Too bad we have to guess, if only we had the Wine source code under a nicely
> > accessible Git archive or so to check?
> >
> > git clone git://source.winehq.org/git/wine.git
> >
> > ;-)
>
> You don't say?
>
> It appears that Wine uses set_thread_area with a fallback to
> modify_ldt for 32-bit binaries and arch_prctl for 64-bit.

I also think it uses modify_ldt() unconditionally for 16-bit APIs:

triton:~/wine> git grep wine_ldt_set_entry
dlls/krnl386.exe16/dosvm.c: wine_ldt_set_entry( sel, &entry );
dlls/krnl386.exe16/int31.c: wine_ldt_set_entry( sel, &entry );
dlls/krnl386.exe16/int31.c: wine_ldt_set_entry( BX_reg(context), entry );
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel + (i << __AHSHIFT), &entry ) < 0)
dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( newsel + (i << __AHSHIFT), &entry );
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel + (i << __AHSHIFT), &entry ) < 0) return FALSE;
dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( selDst, &entry );
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( newsel, &entry ) >= 0) return newsel;
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( newsel, &entry ) >= 0) return newsel;
dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( SELECTOROF(ptr), &entry );
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel, &entry ) < 0) sel = 0;
dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel, &entry ) < 0) sel = 0;
dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( sel, &entry );
dlls/krnl386.exe16/wowthunk.c: wine_ldt_set_entry( codesel, &entry );
dlls/user.exe16/message.c: wine_ldt_set_entry( thunk_selector, &entry );
include/wine/library.h:extern int wine_ldt_set_entry( unsigned short sel, const LDT_ENTRY *entry );
libs/wine/ldt.c: * wine_ldt_set_entry
libs/wine/ldt.c:int wine_ldt_set_entry( unsigned short sel, const LDT_ENTRY *entry )
libs/wine/wine.map: wine_ldt_set_entry;

So the situation looks mostly encouraging, IMHO.
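
For readers who don't want to dig through libs/wine/ldt.c: a
wine_ldt_set_entry-style helper is, at its core, a thin wrapper around
modify_ldt(2). A rough sketch, assuming the standard struct user_desc
ABI; the real function does more conversion work, and ldt_set_entry is
a made-up name:

#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>

static int ldt_set_entry(unsigned short sel, unsigned int base,
			 unsigned int limit, int is_code)
{
	struct user_desc d = {
		.entry_number	= sel >> 3,	/* selector -> LDT slot */
		.base_addr	= base,
		.limit		= limit,
		.seg_32bit	= 0,		/* 16-bit segment */
		.contents	= is_code ? 2 : 0, /* 2 = code, 0 = data */
		.useable	= 1,
	};

	/* func 0x11: write one LDT entry ("new mode" semantics) */
	return syscall(SYS_modify_ldt, 0x11, &d, sizeof(d));
}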

Plus 'strace -fc' output from a modern, Wine-driven Windows game would probably
tell us pretty definitively whether there's anything particularly
performance-sensitive about modify_ldt().

Thanks,

Ingo

2015-07-22 12:35:19

by Willy Tarreau

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 11:23:02PM -0700, Andy Lutomirski wrote:
> >> >> +config MODIFY_LDT_SYSCALL
> >> >> + bool "Enable the LDT (local descriptor table)" if EXPERT
> >> >> + default y
> >> >> + ---help---
> >> >> + Linux can allow user programs to install a per-process x86
> >> >> + Local Descriptor Table (LDT) using the modify_ldt(2) system
> >> >> + call. This is required to run 16-bit or segmented code such as
> >> >> + DOSEMU or some Wine programs. It is also used by some very old
> >> >> + threading libraries.
> >> >> +
> >> >> + Enabling this feature adds a small amount of overhead to
> >> >> + context switches and increases the low-level kernel attack
> >> >> + surface. Disabling it removes the modify_ldt(2) system call.
> >> >> +
> >> >> + Saying 'N' here may make sense for embedded or server kernels.
> >> >> +
> >> >
> >> > I believe Wine still uses the LDT for thread-local data, even for 32
> >> > and 64-bit programs. This is separate from the Linux runtime TLS.
> >> >
> >>
> >> Really? I thought the whole reason we had three set_thread_area slots
> >> was for Wine.
> >
> > Too bad we have to guess, if only we had the Wine source code under a nicely
> > accessible Git archive or so to check?
> >
> > git clone git://source.winehq.org/git/wine.git
> >
> > ;-)
>
> You don't say?
>
> It appears that Wine uses set_thread_area with a fallback to
> modify_ldt for 32-bit binaries and arch_prctl for 64-bit.

Why wouldn't we have this (as well as X86_16BIT) as a sysctl for the long
term, just like we finally got rid of the NULL mapping? We would encourage
distros to ship with those settings disabled by default and to only enable
them when breakage is *observed*. Currently I think that adding new config
options will just make distros ship with the option enabled "just in case".

It's also a nice way to discover users of these mechanisms, and either to
suggest that their developers contemplate other options or to whine at
them loudly.
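
Purely as illustration, the knob could be as small as this; the name and
wiring are invented here, not taken from any posted patch:

#include <linux/sysctl.h>

static int ldt_enabled = 1;		/* default on, for compatibility */
static int zero, one = 1;

static struct ctl_table ldt_table[] = {
	{
		.procname	= "modify_ldt",
		.data		= &ldt_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,	/* clamp to 0..1 */
		.extra2		= &one,
	},
	{ }
};

/* registered e.g. under /proc/sys/abi/ via register_sysctl("abi", ldt_table) */

sys_modify_ldt() (and the 16-bit paths behind X86_16BIT) would then bail
out when the flag is clear, and distros could flip the default from
/etc/sysctl.conf instead of rebuilding the kernel.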

Willy

2015-07-22 18:49:43

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] x86/ldt: Make modify_ldt optional

On Tue, Jul 21, 2015 at 11:27 PM, Ingo Molnar <[email protected]> wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
>> On Tue, Jul 21, 2015 at 11:06 PM, Ingo Molnar <[email protected]> wrote:
>> >
>> > * Andy Lutomirski <[email protected]> wrote:
>> >
>> >> On Tue, Jul 21, 2015 at 1:28 PM, Brian Gerst <[email protected]> wrote:
>> >> > On Tue, Jul 21, 2015 at 3:59 PM, Andy Lutomirski <[email protected]> wrote:
>> >> >>
>> >> >> The modify_ldt syscall exposes a large attack surface and is unnecessary for
>> >> >> modern userspace. Make it optional.
>> >> >>
>> >> >> Signed-off-by: Andy Lutomirski <[email protected]>
>> >> >> ---
>> >> >> arch/x86/Kconfig | 17 +++++++++++++++++
>> >> >> arch/x86/include/asm/mmu.h | 2 ++
>> >> >> arch/x86/include/asm/mmu_context.h | 31 +++++++++++++++++++++++--------
>> >> >> arch/x86/kernel/Makefile | 3 ++-
>> >> >> arch/x86/kernel/cpu/perf_event.c | 4 ++++
>> >> >> arch/x86/kernel/process_64.c | 2 ++
>> >> >> arch/x86/kernel/step.c | 2 ++
>> >> >> kernel/sys_ni.c | 1 +
>> >> >> 8 files changed, 53 insertions(+), 9 deletions(-)
>> >> >>
>> >> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> >> >> index 55bced17dc95..a7ff3980bd65 100644
>> >> >> --- a/arch/x86/Kconfig
>> >> >> +++ b/arch/x86/Kconfig
>> >> >> @@ -1009,6 +1009,7 @@ config VM86
>> >> >> config X86_16BIT
>> >> >> bool "Enable support for 16-bit segments" if EXPERT
>> >> >> default y
>> >> >> + depends on MODIFY_LDT_SYSCALL
>> >> >> ---help---
>> >> >> This option is required by programs like Wine to run 16-bit
>> >> >> protected mode legacy code on x86 processors. Disabling
>> >> >> @@ -2047,6 +2048,22 @@ config CMDLINE_OVERRIDE
>> >> >> This is used to work around broken boot loaders. This should
>> >> >> be set to 'N' under normal conditions.
>> >> >>
>> >> >> +config MODIFY_LDT_SYSCALL
>> >> >> + bool "Enable the LDT (local descriptor table)" if EXPERT
>> >> >> + default y
>> >> >> + ---help---
>> >> >> + Linux can allow user programs to install a per-process x86
>> >> >> + Local Descriptor Table (LDT) using the modify_ldt(2) system
>> >> >> + call. This is required to run 16-bit or segmented code such as
>> >> >> + DOSEMU or some Wine programs. It is also used by some very old
>> >> >> + threading libraries.
>> >> >> +
>> >> >> + Enabling this feature adds a small amount of overhead to
>> >> >> + context switches and increases the low-level kernel attack
>> >> >> + surface. Disabling it removes the modify_ldt(2) system call.
>> >> >> +
>> >> >> + Saying 'N' here may make sense for embedded or server kernels.
>> >> >> +
>> >> >
>> >> > I believe Wine still uses the LDT for thread-local data, even for 32
>> >> > and 64-bit programs. This is separate from the Linux runtime TLS.
>> >> >
>> >>
>> >> Really? I thought the whole reason we had three set_thread_area slots
>> >> was for Wine.
>> >
>> > Too bad we have to guess, if only we had the Wine source code under a nicely
>> > accessible Git archive or so to check?
>> >
>> > git clone git://source.winehq.org/git/wine.git
>> >
>> > ;-)
>>
>> You don't say?
>>
>> It appears that Wine uses set_thread_area with a fallback to
>> modify_ldt for 32-bit binaries and arch_prctl for 64-bit.
>
> I also think it uses modify_ldt() unconditionally for 16-bit APIs:
>
> triton:~/wine> git grep wine_ldt_set_entry
> dlls/krnl386.exe16/dosvm.c: wine_ldt_set_entry( sel, &entry );
> dlls/krnl386.exe16/int31.c: wine_ldt_set_entry( sel, &entry );
> dlls/krnl386.exe16/int31.c: wine_ldt_set_entry( BX_reg(context), entry );
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel + (i << __AHSHIFT), &entry ) < 0)
> dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( newsel + (i << __AHSHIFT), &entry );
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel + (i << __AHSHIFT), &entry ) < 0) return FALSE;
> dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( selDst, &entry );
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( newsel, &entry ) >= 0) return newsel;
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( newsel, &entry ) >= 0) return newsel;
> dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( SELECTOROF(ptr), &entry );
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel, &entry ) < 0) sel = 0;
> dlls/krnl386.exe16/selector.c: if (wine_ldt_set_entry( sel, &entry ) < 0) sel = 0;
> dlls/krnl386.exe16/selector.c: wine_ldt_set_entry( sel, &entry );
> dlls/krnl386.exe16/wowthunk.c: wine_ldt_set_entry( codesel, &entry );
> dlls/user.exe16/message.c: wine_ldt_set_entry( thunk_selector, &entry );
> include/wine/library.h:extern int wine_ldt_set_entry( unsigned short sel, const LDT_ENTRY *entry );
> libs/wine/ldt.c: * wine_ldt_set_entry
> libs/wine/ldt.c:int wine_ldt_set_entry( unsigned short sel, const LDT_ENTRY *entry )
> libs/wine/wine.map: wine_ldt_set_entry;
>
> So the situation looks mostly encouraging, IMHO.
>
> Plus 'strace -fc' output from a modern, Wine-driven Windows game would probably
> tell us pretty definitively whether there's anything particularly
> performance-sensitive about modify_ldt().
>

I don't have any of those. However, strace -fc reports that winemine
doesn't call modify_ldt at all, which leads me to believe that this is
a non-issue except for 16-bit games.

Dosemu calls modify_ldt five times at startup.

--Andy