Message-ID: <55B01745.4010702@oracle.com>
Date: Wed, 22 Jul 2015 18:20:53 -0400
From: Boris Ostrovsky
To: Andy Lutomirski, Peter Zijlstra, Steven Rostedt
CC: "security@kernel.org", X86 ML, Borislav Petkov, Sasha Levin,
 linux-kernel@vger.kernel.org, Konrad Rzeszutek Wilk, Andrew Cooper,
 Jan Beulich, xen-devel, stable@vger.kernel.org
Subject: Re: [PATCH v3 1/3] x86/ldt: Make modify_ldt synchronous
References: <049fdbab8ae2ecac1c8b40ecd558e9df45ccd5d3.1437592883.git.luto@kernel.org>
In-Reply-To: <049fdbab8ae2ecac1c8b40ecd558e9df45ccd5d3.1437592883.git.luto@kernel.org>

On 07/22/2015 03:23 PM, Andy Lutomirski wrote:
> modify_ldt has questionable locking and does not synchronize
> threads. Improve it: redesign the locking and synchronize all
> threads' LDTs using an IPI on all modifications.
>
> This will dramatically slow down modify_ldt in multithreaded
> programs, but there shouldn't be any multithreaded programs that
> care about modify_ldt's performance in the first place.
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Andy Lutomirski
> ---
>  arch/x86/include/asm/desc.h        |  15 ---
>  arch/x86/include/asm/mmu.h         |   3 +-
>  arch/x86/include/asm/mmu_context.h |  53 +++++++-
>  arch/x86/kernel/cpu/common.c       |   4 +-
>  arch/x86/kernel/cpu/perf_event.c   |  12 +-
>  arch/x86/kernel/ldt.c              | 258 ++++++++++++++++++++-----------------
>  arch/x86/kernel/process_64.c       |   4 +-
>  arch/x86/kernel/step.c             |   6 +-
>  arch/x86/power/cpu.c               |   3 +-
>  9 files changed, 209 insertions(+), 149 deletions(-)
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index a0bf89fd2647..4e10d73cf018 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -280,21 +280,6 @@ static inline void clear_LDT(void)
>  	set_ldt(NULL, 0);
>  }
>
> -/*
> - * load one particular LDT into the current CPU
> - */
> -static inline void load_LDT_nolock(mm_context_t *pc)
> -{
> -	set_ldt(pc->ldt, pc->size);
> -}
> -
> -static inline void load_LDT(mm_context_t *pc)
> -{
> -	preempt_disable();
> -	load_LDT_nolock(pc);
> -	preempt_enable();
> -}
> -
>  static inline unsigned long get_desc_base(const struct desc_struct *desc)
>  {
>  	return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 09b9620a73b4..364d27481a52 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -9,8 +9,7 @@
>   * we put the segment information here.
>   */
>  typedef struct {
> -	void *ldt;
> -	int size;
> +	struct ldt_struct *ldt;
>
>  #ifdef CONFIG_X86_64
>  	/* True if mm supports a task running in 32 bit compatibility mode. */
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 804a3a6030ca..3fcff70c398e 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -34,6 +34,49 @@ static inline void load_mm_cr4(struct mm_struct *mm) {}
>  #endif
>
>  /*
> + * ldt_structs can be allocated, used, and freed, but they are never
> + * modified while live.
> + */
> +struct ldt_struct {
> +	/*
> +	 * Xen requires page-aligned LDTs with special permissions. This is
> +	 * needed to prevent us from installing evil descriptors such as
> +	 * call gates. On native, we could merge the ldt_struct and LDT
> +	 * allocations, but it's not worth trying to optimize.
> +	 */
> +	struct desc_struct *entries;
> +	int size;
> +};
> +
> +static inline void load_mm_ldt(struct mm_struct *mm)
> +{
> +	struct ldt_struct *ldt;
> +	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
> +
> +	/* lockless_dereference synchronizes with smp_store_release */
> +	ldt = lockless_dereference(mm->context.ldt);
> +
> +	/*
> +	 * Any change to mm->context.ldt is followed by an IPI to all
> +	 * CPUs with the mm active. The LDT will not be freed until
> +	 * after the IPI is handled by all such CPUs. This means that,
> +	 * if the ldt_struct changes before we return, the values we see
> +	 * will be safe, and the new values will be loaded before we run
> +	 * any user code.
> +	 *
> +	 * NB: don't try to convert this to use RCU without extreme care.
> +	 * We would still need IRQs off, because we don't want to change
> +	 * the local LDT after an IPI loaded a newer value than the one
> +	 * that we can see.
> +	 */
> +
> +	if (unlikely(ldt))
> +		set_ldt(ldt->entries, ldt->size);
> +	else
> +		clear_LDT();
> +}
> +
> +/*
>   * Used for LDT copy/destruction.
>   */
>  int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
> @@ -78,12 +121,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  		 * was called and then modify_ldt changed
>  		 * prev->context.ldt but suppressed an IPI to this CPU.
>  		 * In this case, prev->context.ldt != NULL, because we
> -		 * never free an LDT while the mm still exists. That
> -		 * means that next->context.ldt != prev->context.ldt,
> -		 * because mms never share an LDT.
> +		 * never set context.ldt to NULL while the mm still
> +		 * exists. That means that next->context.ldt !=
> +		 * prev->context.ldt, because mms never share an LDT.
>  		 */
>  		if (unlikely(prev->context.ldt != next->context.ldt))
> -			load_LDT_nolock(&next->context);
> +			load_mm_ldt(next);
>  	}
>  #ifdef CONFIG_SMP
>  	else {
> @@ -106,7 +149,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  			load_cr3(next->pgd);
>  			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
>  			load_mm_cr4(next);
> -			load_LDT_nolock(&next->context);
> +			load_mm_ldt(next);
>  		}
>  	}
>  #endif
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 922c5e0cea4c..cb9e5df42dd2 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -1410,7 +1410,7 @@ void cpu_init(void)
>  	load_sp0(t, &current->thread);
>  	set_tss_desc(cpu, t);
>  	load_TR_desc();
> -	load_LDT(&init_mm.context);
> +	load_mm_ldt(&init_mm);
>
>  	clear_all_debug_regs();
>  	dbg_restore_debug_regs();
> @@ -1459,7 +1459,7 @@ void cpu_init(void)
>  	load_sp0(t, thread);
>  	set_tss_desc(cpu, t);
>  	load_TR_desc();
> -	load_LDT(&init_mm.context);
> +	load_mm_ldt(&init_mm);
>
>  	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 3658de47900f..9469dfa55607 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -2179,21 +2179,25 @@ static unsigned long get_segment_base(unsigned int segment)
>  	int idx = segment >> 3;
>
>  	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
> +		struct ldt_struct *ldt;
> +
>  		if (idx > LDT_ENTRIES)
>  			return 0;
>
> -		if (idx > current->active_mm->context.size)
> +		/* IRQs are off, so this synchronizes with smp_store_release */
> +		ldt = lockless_dereference(current->active_mm->context.ldt);
> +		if (!ldt || idx > ldt->size)
>  			return 0;
>
> -		desc = current->active_mm->context.ldt;
> +		desc = &ldt->entries[idx];
>  	} else {
>  		if (idx > GDT_ENTRIES)
>  			return 0;
>
> -		desc = raw_cpu_ptr(gdt_page.gdt);
> +		desc = raw_cpu_ptr(gdt_page.gdt) + idx;
>  	}
>
> -	return get_desc_base(desc + idx);
> +	return get_desc_base(desc);
>  }
>
>  #ifdef CONFIG_COMPAT
> diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
> index c37886d759cc..3ae308029dee 100644
> --- a/arch/x86/kernel/ldt.c
> +++ b/arch/x86/kernel/ldt.c
> @@ -12,6 +12,7 @@
>  #include
>  #include
>  #include
> +#include
>  #include
>  #include
>
> @@ -20,82 +21,83 @@
>  #include
>  #include
>
> -#ifdef CONFIG_SMP
>  static void flush_ldt(void *current_mm)
>  {
> -	if (current->active_mm == current_mm)
> -		load_LDT(&current->active_mm->context);
> +	if (current->active_mm == current_mm) {
> +		/* context.lock is held for us, so we don't need any locking. */
> +		mm_context_t *pc = &current->active_mm->context;
> +		set_ldt(pc->ldt->entries, pc->ldt->size);
> +	}
>  }
> -#endif
>
> -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
> +/* The caller must call finalize_ldt_struct on the result. */
> +static struct ldt_struct *alloc_ldt_struct(int size)
>  {
> -	void *oldldt, *newldt;
> -	int oldsize;
> +	struct ldt_struct *new_ldt;
> +	int alloc_size;
>
> -	if (mincount <= pc->size)
> -		return 0;
> -	oldsize = pc->size;
> -	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
> -			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
> -	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
> -		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
> +	if (size > LDT_ENTRIES)
> +		return NULL;
> +
> +	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
> +	if (!new_ldt)
> +		return NULL;
> +
> +	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
> +	alloc_size = size * LDT_ENTRY_SIZE;
> +
> +	if (alloc_size > PAGE_SIZE)
> +		new_ldt->entries = vmalloc(alloc_size);
>  	else
> -		newldt = (void *)__get_free_page(GFP_KERNEL);
> +		new_ldt->entries = (void *)__get_free_page(GFP_KERNEL);
> +	if (!new_ldt->entries) {
> +		kfree(new_ldt);
> +		return NULL;
> +	}
>
> -	if (!newldt)
> -		return -ENOMEM;
> +	new_ldt->size = size;
> +	return new_ldt;
> +}
> +
> +/* After calling this, the LDT is immutable. */
> +static void finalize_ldt_struct(struct ldt_struct *ldt)
> +{
> +	paravirt_alloc_ldt(ldt->entries, ldt->size);
> +}
>
> -	if (oldsize)
> -		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
> -	oldldt = pc->ldt;
> -	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
> -	       (mincount - oldsize) * LDT_ENTRY_SIZE);
> +static void install_ldt(struct mm_struct *current_mm,
> +			struct ldt_struct *ldt)
> +{
> +	/* context.lock is held */
> +	preempt_disable();
>
> -	paravirt_alloc_ldt(newldt, mincount);
> +	/* Synchronizes with lockless_dereference in load_mm_ldt. */
> +	smp_store_release(&current_mm->context.ldt, ldt);
>
> -#ifdef CONFIG_X86_64
> -	/* CHECKME: Do we really need this ? */
> -	wmb();
> -#endif
> -	pc->ldt = newldt;
> -	wmb();
> -	pc->size = mincount;
> -	wmb();
> +	/* Activate for this CPU. */
> +	flush_ldt(current->mm);
>
> -	if (reload) {
>  #ifdef CONFIG_SMP
> -		preempt_disable();
> -		load_LDT(pc);
> -		if (!cpumask_equal(mm_cpumask(current->mm),
> -				   cpumask_of(smp_processor_id())))
> -			smp_call_function(flush_ldt, current->mm, 1);
> -		preempt_enable();
> -#else
> -		load_LDT(pc);
> +	/* Synchronize with other CPUs. */
> +	if (!cpumask_equal(mm_cpumask(current_mm),
> +			   cpumask_of(smp_processor_id())))
> +		smp_call_function(flush_ldt, current_mm, 1);
>  #endif
> -	}
> -	if (oldsize) {
> -		paravirt_free_ldt(oldldt, oldsize);
> -		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
> -			vfree(oldldt);
> -		else
> -			put_page(virt_to_page(oldldt));
> -	}
> -	return 0;
> +	preempt_enable();
>  }
>
> -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
> +static void free_ldt_struct(struct ldt_struct *ldt)
>  {
> -	int err = alloc_ldt(new, old->size, 0);
> -	int i;
> -
> -	if (err < 0)
> -		return err;
> -
> -	for (i = 0; i < old->size; i++)
> -		write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
> -	return 0;
> +	if (unlikely(ldt)) {
> +		int alloc_size = sizeof(struct ldt_struct) +
> +			ldt->size * LDT_ENTRY_SIZE;
> +		paravirt_free_ldt(ldt->entries, ldt->size);
> +		if (alloc_size > PAGE_SIZE)
> +			vfree(ldt->entries);
> +		else
> +			put_page(virt_to_page(ldt->entries));
> +		kfree(ldt);
> +	}
>  }
>
>  /*
> @@ -108,13 +110,32 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  	int retval = 0;
>
>  	mutex_init(&mm->context.lock);
> -	mm->context.size = 0;
>  	old_mm = current->mm;
> -	if (old_mm && old_mm->context.size > 0) {
> -		mutex_lock(&old_mm->context.lock);
> -		retval = copy_ldt(&mm->context, &old_mm->context);
> -		mutex_unlock(&old_mm->context.lock);
> +	if (!old_mm) {
> +		mm->context.ldt = NULL;
> +		return 0;
> +	}
> +
> +	mutex_lock(&old_mm->context.lock);
> +	if (old_mm->context.ldt) {
> +		struct ldt_struct *new_ldt =
> +			alloc_ldt_struct(old_mm->context.ldt->size);
> +		if (!new_ldt) {
> +			retval = -ENOMEM;
> +			goto out_unlock;
> +		}
> +
> +		memcpy(new_ldt->entries, old_mm->context.ldt->entries,
> +		       new_ldt->size * LDT_ENTRY_SIZE);
> +		finalize_ldt_struct(new_ldt);
> +
> +		mm->context.ldt = new_ldt;
> +	} else {
> +		mm->context.ldt = NULL;
>  	}
> +
> +out_unlock:
> +	mutex_unlock(&old_mm->context.lock);
>  	return retval;
>  }
>
> @@ -125,53 +146,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>   */
>  void destroy_context(struct mm_struct *mm)
>  {
> -	if (mm->context.size) {
> -#ifdef CONFIG_X86_32
> -		/* CHECKME: Can this ever happen ? */
> -		if (mm == current->active_mm)
> -			clear_LDT();
> -#endif
> -		paravirt_free_ldt(mm->context.ldt, mm->context.size);
> -		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
> -			vfree(mm->context.ldt);
> -		else
> -			put_page(virt_to_page(mm->context.ldt));
> -		mm->context.size = 0;
> -	}
> +	free_ldt_struct(mm->context.ldt);
> +	mm->context.ldt = NULL;
>  }
>
>  static int read_ldt(void __user *ptr, unsigned long bytecount)
>  {
> -	int err;
> +	int retval;
>  	unsigned long size;
>  	struct mm_struct *mm = current->mm;
>
> -	if (!mm->context.size)
> -		return 0;
> +	mutex_lock(&mm->context.lock);
> +
> +	if (!mm->context.ldt) {
> +		retval = 0;
> +		goto out_unlock;
> +	}
> +
>  	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
>  		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
>
> -	mutex_lock(&mm->context.lock);
> -	size = mm->context.size * LDT_ENTRY_SIZE;
> +	size = mm->context.ldt->size * LDT_ENTRY_SIZE;
>  	if (size > bytecount)
>  		size = bytecount;
>
> -	err = 0;
> -	if (copy_to_user(ptr, mm->context.ldt, size))
> -		err = -EFAULT;
> -	mutex_unlock(&mm->context.lock);
> -	if (err < 0)
> -		goto error_return;
> +	if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
> +		retval = -EFAULT;
> +		goto out_unlock;
> +	}
> +
>  	if (size != bytecount) {
> -		/* zero-fill the rest */
> +		/* Zero-fill the rest and pretend we read bytecount bytes. */
>  		if (clear_user(ptr + size, bytecount - size) != 0) {
> -			err = -EFAULT;
> -			goto error_return;
> +			retval = -EFAULT;
> +			goto out_unlock;
>  		}
>  	}
> -	return bytecount;
> -error_return:
> -	return err;
> +	retval = bytecount;
> +
> +out_unlock:
> +	mutex_unlock(&mm->context.lock);
> +	return retval;
>  }
>
>  static int read_default_ldt(void __user *ptr, unsigned long bytecount)
> @@ -195,6 +210,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
>  	struct desc_struct ldt;
>  	int error;
>  	struct user_desc ldt_info;
> +	int oldsize, newsize;
> +	struct ldt_struct *new_ldt, *old_ldt;
>
>  	error = -EINVAL;
>  	if (bytecount != sizeof(ldt_info))
> @@ -213,34 +230,43 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
>  		goto out;
>  	}
>
> -	mutex_lock(&mm->context.lock);
> -	if (ldt_info.entry_number >= mm->context.size) {
> -		error = alloc_ldt(&current->mm->context,
> -				  ldt_info.entry_number + 1, 1);
> -		if (error < 0)
> -			goto out_unlock;
> -	}
> -
> -	/* Allow LDTs to be cleared by the user. */
> -	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
> -		if (oldmode || LDT_empty(&ldt_info)) {
> -			memset(&ldt, 0, sizeof(ldt));
> -			goto install;
> +	if ((oldmode && ldt_info.base_addr == 0 && ldt_info.limit == 0) ||
> +	    LDT_empty(&ldt_info)) {
> +		/* The user wants to clear the entry. */
> +		memset(&ldt, 0, sizeof(ldt));
> +	} else {
> +		if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> +			error = -EINVAL;
> +			goto out;
>  		}
> +
> +		fill_ldt(&ldt, &ldt_info);
> +		if (oldmode)
> +			ldt.avl = 0;
>  	}
>
> -	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> -		error = -EINVAL;
> +	mutex_lock(&mm->context.lock);
> +
> +	old_ldt = mm->context.ldt;
> +	oldsize = old_ldt ? old_ldt->size : 0;
> +	newsize = max((int)(ldt_info.entry_number + 1), oldsize);
> +
> +	error = -ENOMEM;
> +	new_ldt = alloc_ldt_struct(newsize);
> +	if (!new_ldt)
>  		goto out_unlock;
> -	}
>
> -	fill_ldt(&ldt, &ldt_info);
> -	if (oldmode)
> -		ldt.avl = 0;
> +	if (old_ldt) {
> +		memcpy(new_ldt->entries, old_ldt->entries,
> +		       oldsize * LDT_ENTRY_SIZE);
> +	}
> +	memset(new_ldt->entries + oldsize * LDT_ENTRY_SIZE, 0,
> +	       (newsize - oldsize) * LDT_ENTRY_SIZE);

We need to zero out the full page (probably better in alloc_ldt_struct(),
with vzalloc()/__GFP_ZERO) --- Xen checks the whole page that is assigned
to the G/LDT and gets unhappy if an invalid descriptor is found there. A
rough sketch of what I mean is at the end of this mail.

This fixes one problem. There is something else that Xen gets upset about;
I haven't figured out what it is yet (and I am out tomorrow, so it may
need to wait until Friday).

-boris

> +	new_ldt->entries[ldt_info.entry_number] = ldt;
> +	finalize_ldt_struct(new_ldt);
>
> -	/* Install the new entry ... */
> -install:
> -	write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
> +	install_ldt(mm, new_ldt);
> +	free_ldt_struct(old_ldt);
>  	error = 0;
>
>  out_unlock:
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 71d7849a07f7..f6b916387590 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all)
>  void release_thread(struct task_struct *dead_task)
>  {
>  	if (dead_task->mm) {
> -		if (dead_task->mm->context.size) {
> +		if (dead_task->mm->context.ldt) {
>  			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
>  				dead_task->comm,
>  				dead_task->mm->context.ldt,
> -				dead_task->mm->context.size);
> +				dead_task->mm->context.ldt->size);
>  			BUG();
>  		}
>  	}
> diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
> index 9b4d51d0c0d0..6273324186ac 100644
> --- a/arch/x86/kernel/step.c
> +++ b/arch/x86/kernel/step.c
> @@ -5,6 +5,7 @@
>  #include
>  #include
>  #include
> +#include
>
>  unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
>  {
> @@ -30,10 +31,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
>  	seg &= ~7UL;
>
>  	mutex_lock(&child->mm->context.lock);
> -	if (unlikely((seg >> 3) >= child->mm->context.size))
> +	if (unlikely(!child->mm->context.ldt ||
> +		     (seg >> 3) >= child->mm->context.ldt->size))
>  		addr = -1L; /* bogus selector, access would fault */
>  	else {
> -		desc = child->mm->context.ldt + seg;
> +		desc = &child->mm->context.ldt->entries[seg];
>  		base = get_desc_base(desc);
>
>  		/* 16-bit code segment? */
> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 0d7dd1f5ac36..9ab52791fed5 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -22,6 +22,7 @@
>  #include
>  #include
>  #include
> +#include
>
>  #ifdef CONFIG_X86_32
>  __visible unsigned long saved_context_ebx;
> @@ -153,7 +154,7 @@ static void fix_processor_context(void)
>  	syscall_init();				/* This sets MSR_*STAR and related */
>  #endif
>  	load_TR_desc();				/* This does ltr */
> -	load_LDT(&current->active_mm->context);	/* This does lldt */
> +	load_mm_ldt(current->active_mm);	/* This does lldt */
>
>  	fpu__resume_cpu();
>  }
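
For reference, below is roughly what I have in mind for alloc_ldt_struct().
This is untested and only a sketch: everything is taken from your patch as
posted, except that the two allocations become zeroing ones (vzalloc() for
the multi-page case, __GFP_ZERO for the single-page case), so the unused
tail of the allocation never holds stale bytes that Xen could trip over:

/*
 * Sketch only: same shape as the patch's alloc_ldt_struct(), but with
 * zeroed allocations so the whole allocation (not just the first 'size'
 * entries) starts out as empty descriptors.
 */
static struct ldt_struct *alloc_ldt_struct(int size)
{
	struct ldt_struct *new_ldt;
	int alloc_size;

	if (size > LDT_ENTRIES)
		return NULL;

	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
	if (!new_ldt)
		return NULL;

	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
	alloc_size = size * LDT_ENTRY_SIZE;

	/* Zeroed memory == empty (not-present) descriptors. */
	if (alloc_size > PAGE_SIZE)
		new_ldt->entries = vzalloc(alloc_size);
	else
		new_ldt->entries = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

	if (!new_ldt->entries) {
		kfree(new_ldt);
		return NULL;
	}

	new_ldt->size = size;
	return new_ldt;
}

With something like that in place, the memset() of the (newsize - oldsize)
tail in write_ldt() should become redundant; the memcpy() of the old
entries is unaffected.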