I think I like this approach. I also think it might be nice to move the
whole cpu_entry_area into this new pgd range so that we can stop mucking
around with the fixmap.
TODO:
- It crashes in ldt_gdt_64. Not sure why.
- 5-level docs aren't updated and the code is untested.
Signed-off-by: Andy Lutomirski <[email protected]>
---
Documentation/x86/x86_64/mm.txt | 11 ++--
arch/x86/include/asm/mmu_context.h | 33 +++++++++-
arch/x86/include/asm/pgtable_64_types.h | 2 +
arch/x86/include/asm/processor.h | 23 +++++--
arch/x86/kernel/ldt.c | 109 +++++++++++++++++++++++++++++++-
5 files changed, 161 insertions(+), 17 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 2d7d6590ade8..bfa44e1cb293 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) LDT range
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
@@ -39,8 +41,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Architecture defines a 64-bit virtual address. Implementations can support
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e1a1ecb65c6..eb87bbeddacc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,13 +52,29 @@ struct ldt_struct {
*/
struct desc_struct *entries;
unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot).
+ */
+ int slot;
};
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
/*
* Used for LDT copy/destruction.
*/
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk,
struct mm_struct *mm)
@@ -90,10 +106,20 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ clear_LDT();
+ return;
+ }
+
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -185,6 +211,7 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..130f575f8d1e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -100,6 +100,8 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define LDT_PGD_ENTRY _AC(-3, UL)
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 9e482d8b0b97..9c18da64daa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ae5615b03def..a0008fb26ba2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -46,13 +47,12 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ __flush_tlb_all();
+ load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -90,9 +90,93 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
}
new_ldt->nr_entries = num_entries;
+ new_ldt->slot = -1;
return new_ldt;
}
+static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ bool is_vmalloc;
+ int i;
+ bool awful_hack;
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return 0;
+
+ WARN_ON(ldt->slot != -1);
+
+ /* Both LDT slots are contained in a single PMD. */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ awful_hack = pgd_none(*pgd);
+
+ p4d = p4d_alloc(mm, pgd, LDT_BASE_ADDR);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = pud_alloc(mm, p4d, LDT_BASE_ADDR);
+ if (!pud)
+ return -ENOMEM;
+ pmd = pmd_alloc(mm, pud, LDT_BASE_ADDR);
+ if (!pmd)
+ return -ENOMEM;
+ if (pte_alloc(mm, pmd, LDT_BASE_ADDR))
+ return -ENOMEM;
+
+ if (true) {
+ /* awful hack -- pti_set_user_pgd is a mess */
+ /* we can't even use the bool awful_hack because of pti_init_all_pgds(), which poops on our pgd. */
+ kernel_to_user_pgdp(pgd)->pgd = pgd->pgd;
+ }
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ unsigned long va = (unsigned long)ldt_slot_va(slot) + offset;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+
+ pte = pte_offset_kernel(pmd, va);
+ set_pte(pte, pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+ }
+
+ flush_tlb_mm_range(mm,
+ (unsigned long)ldt_slot_va(slot),
+ (unsigned long)ldt_slot_va(slot) + LDT_SLOT_STRIDE,
+ 0);
+
+ ldt->slot = slot;
+
+ return 0;
+#else
+ return -EINVAL;
+#endif
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_64
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has_bug(X86_BUG_CPU_SECURE_MODE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@@ -155,8 +239,17 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
memcpy(new_ldt->entries, old_mm->context.ldt->entries,
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval)
+ goto out_free;
mm->context.ldt = new_ldt;
+ goto out_unlock;
+
+out_free:
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ return retval;
out_unlock:
mutex_unlock(&old_mm->context.lock);
@@ -174,6 +267,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@@ -285,6 +383,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ free_ldt_struct(old_ldt);
+ goto out_unlock;
+ }
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
--
2.13.6
On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> I think I like this approach. I also think it might be nice to move the
> whole cpu_entry_area into this new pgd range so that we can stop mucking
> around with the fixmap.
Yeah, and also, I don't like the idea of sacrificing a whole PGD
only for the LDT crap which is optional, even. Frankly - and this
is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <[email protected]> wrote:
> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>> I think I like this approach. I also think it might be nice to move the
>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>> around with the fixmap.
>
> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> only for the LDT crap which is optional, even. Frankly - and this
> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
The PGD sacrifice doesn't bother me. Putting a writable LDT map at a
constant address does bother me. We could probably get away with RO
if we trapped and handled the nasty faults, but that could be very
problematic.
The version here:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
actually seems to work.
On Thu, 7 Dec 2017, Andy Lutomirski wrote:
> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <[email protected]> wrote:
> > On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >> I think I like this approach. I also think it might be nice to move the
> >> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >> around with the fixmap.
> >
> > Yeah, and also, I don't like the idea of sacrificing a whole PGD
> > only for the LDT crap which is optional, even. Frankly - and this
> > is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> > CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
>
> The PGD sacrifice doesn't bother me. Putting a writable LDT map at a
> constant address does bother me. We could probably get away with RO
> if we trapped and handled the nasty faults, but that could be very
> problematic.
Where is the problem? You can map it RO into user space with the USER bit
cleared. The kernel knows how to access the real stuff.
> The version here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
>
> actually seems to work.
The approach I've taken is to create a VMA and map it into user space with
the USER bit cleared. A little bit more effort code wise, but that avoids
all the page table muck and keeps it straight attached to the process.
Will post once in a bit.
Thanks,
tglx
> On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
>>
>>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <[email protected]> wrote:
>>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
>>>> I think I like this approach. I also think it might be nice to move the
>>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
>>>> around with the fixmap.
>>>
>>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
>>> only for the LDT crap which is optional, even. Frankly - and this
>>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
>>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
>>
>> The PGD sacrifice doesn't bother me. Putting a writable LDT map at a
>> constant address does bother me. We could probably get away with RO
>> if we trapped and handled the nasty faults, but that could be very
>> problematic.
>
> Where is the problem? You can map it RO into user space with the USER bit
> cleared. The kernel knows how to access the real stuff.
Blows up when the CPU tries to set the accessed bit.
>
>> The version here:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/pti&id=a74d1009ac72a1f04ec5bcd338a4bdbe170ab776
>>
>> actually seems to work.
>
> The approach I've taken is to create a VMA and map it into user space with
> the USER bit cleared. A little bit more effort code wise, but that avoids
> all the page table muck and keeps it straight attached to the process.
>
> Will post once in a bit.
I don't love mucking with user address space. I'm also quite nervous about putting it in our near anything that could pass an access_ok check, since we're totally screwed if the bad guys can figure out how to write to it.
>
> Thanks,
>
> tglx
>
* Andy Lutomirski <[email protected]> wrote:
>
>
> > On Dec 7, 2017, at 9:23 AM, Thomas Gleixner <[email protected]> wrote:
> >
> >> On Thu, 7 Dec 2017, Andy Lutomirski wrote:
> >>
> >>> On Thu, Dec 7, 2017 at 4:43 AM, Borislav Petkov <[email protected]> wrote:
> >>>> On Wed, Dec 06, 2017 at 11:22:21PM -0800, Andy Lutomirski wrote:
> >>>> I think I like this approach. I also think it might be nice to move the
> >>>> whole cpu_entry_area into this new pgd range so that we can stop mucking
> >>>> around with the fixmap.
> >>>
> >>> Yeah, and also, I don't like the idea of sacrificing a whole PGD
> >>> only for the LDT crap which is optional, even. Frankly - and this
> >>> is just me - I'd make CONFIG_KERNEL_PAGE_TABLE_ISOLATION xor
> >>> CONFIG_MODIFY_LDT_SYSCALL and don't give a rat's *ss about the LDT.
> >>
> >> The PGD sacrifice doesn't bother me. Putting a writable LDT map at a
> >> constant address does bother me. We could probably get away with RO
> >> if we trapped and handled the nasty faults, but that could be very
> >> problematic.
> >
> > Where is the problem? You can map it RO into user space with the USER bit
> > cleared. The kernel knows how to access the real stuff.
>
> Blows up when the CPU tries to set the accessed bit.
BTW., could we force the accessed bit to be always set, without breaking the ABI?
> > The approach I've taken is to create a VMA and map it into user space with
> > the USER bit cleared. A little bit more effort code wise, but that avoids
> > all the page table muck and keeps it straight attached to the process.
> >
> > Will post once in a bit.
>
> I don't love mucking with user address space. I'm also quite nervous about
> putting it in our near anything that could pass an access_ok check, since we're
> totally screwed if the bad guys can figure out how to write to it.
Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
Can we have vmas with high addresses, in the vmalloc space for example?
IIRC the GPU code has precedents in that area.
Since this is x86-64, limitation of the vmalloc() space is not an issue.
I like Thomas's solution:
- have the LDT in a regular mmap space vma (hence per process ASLR randomized),
but with the system bit set.
- That would be an advantage even for non-PTI kernels, because mmap() is probably
more randomized than kmalloc().
- It would also be a cleaner approach all around, and would avoid the fixmap
complications and the scheduler muckery.
Thanks,
Ingo
On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Andy Lutomirski <[email protected]> wrote:
> > I don't love mucking with user address space. I'm also quite nervous about
> > putting it in our near anything that could pass an access_ok check, since we're
> > totally screwed if the bad guys can figure out how to write to it.
>
> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>
> Can we have vmas with high addresses, in the vmalloc space for example?
> IIRC the GPU code has precedents in that area.
>
> Since this is x86-64, limitation of the vmalloc() space is not an issue.
>
> I like Thomas's solution:
>
> - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
> but with the system bit set.
>
> - That would be an advantage even for non-PTI kernels, because mmap() is probably
> more randomized than kmalloc().
Randomization is pointless as long as you can get the LDT address in user
space, i.e. w/o UMIP.
> - It would also be a cleaner approach all around, and would avoid the fixmap
> complications and the scheduler muckery.
The error code of such an access is always 0x03. So I added a special
handler, which checks whether the address is in the LDT map range and
verifies that the access bit in the descriptor is 0. If that's the case it
sets it and returns. If not, the thing dies. That works.
Thanks,
tglx
* Thomas Gleixner <[email protected]> wrote:
> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Andy Lutomirski <[email protected]> wrote:
> > > I don't love mucking with user address space. I'm also quite nervous about
> > > putting it in our near anything that could pass an access_ok check, since we're
> > > totally screwed if the bad guys can figure out how to write to it.
> >
> > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> >
> > Can we have vmas with high addresses, in the vmalloc space for example?
> > IIRC the GPU code has precedents in that area.
> >
> > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> >
> > I like Thomas's solution:
> >
> > - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
> > but with the system bit set.
> >
> > - That would be an advantage even for non-PTI kernels, because mmap() is probably
> > more randomized than kmalloc().
>
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.
But with UMIP unprivileged user-space won't be able to get the linear address of
the LDT. Now it's written out in /proc/self/maps.
> > - It would also be a cleaner approach all around, and would avoid the fixmap
> > complications and the scheduler muckery.
>
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.
Are SMP races possible? For example two threads both triggering the accessed bit
fault, but only one of them succeeding in setting it. The other thread should not
die in this case, right?
Thanks,
Ingo
On Fri, 8 Dec 2017, Ingo Molnar wrote:
> * Thomas Gleixner <[email protected]> wrote:
>
> > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > * Andy Lutomirski <[email protected]> wrote:
> > > > I don't love mucking with user address space. I'm also quite nervous about
> > > > putting it in our near anything that could pass an access_ok check, since we're
> > > > totally screwed if the bad guys can figure out how to write to it.
> > >
> > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > >
> > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > IIRC the GPU code has precedents in that area.
> > >
> > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > >
> > > I like Thomas's solution:
> > >
> > > - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
> > > but with the system bit set.
> > >
> > > - That would be an advantage even for non-PTI kernels, because mmap() is probably
> > > more randomized than kmalloc().
> >
> > Randomization is pointless as long as you can get the LDT address in user
> > space, i.e. w/o UMIP.
>
> But with UMIP unprivileged user-space won't be able to get the linear address of
> the LDT. Now it's written out in /proc/self/maps.
We can expose it nameless like other VMAs, but then it's 128k sized so it
can be figured out. But when it's RO then it's not really a problem, even
the kernel can't write to it.
> > > - It would also be a cleaner approach all around, and would avoid the fixmap
> > > complications and the scheduler muckery.
> >
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
>
> Are SMP races possible? For example two threads both triggering the accessed bit
> fault, but only one of them succeeding in setting it. The other thread should not
> die in this case, right?
Right. I'm trying to figure out whether there is a way to reliably detect
that write access bit mode.
Thanks,
tglx
* Thomas Gleixner <[email protected]> wrote:
> On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > * Thomas Gleixner <[email protected]> wrote:
> >
> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
> > > > * Andy Lutomirski <[email protected]> wrote:
> > > > > I don't love mucking with user address space. I'm also quite nervous about
> > > > > putting it in our near anything that could pass an access_ok check, since we're
> > > > > totally screwed if the bad guys can figure out how to write to it.
> > > >
> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
> > > >
> > > > Can we have vmas with high addresses, in the vmalloc space for example?
> > > > IIRC the GPU code has precedents in that area.
> > > >
> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
> > > >
> > > > I like Thomas's solution:
> > > >
> > > > - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
> > > > but with the system bit set.
> > > >
> > > > - That would be an advantage even for non-PTI kernels, because mmap() is probably
> > > > more randomized than kmalloc().
> > >
> > > Randomization is pointless as long as you can get the LDT address in user
> > > space, i.e. w/o UMIP.
> >
> > But with UMIP unprivileged user-space won't be able to get the linear address of
> > the LDT. Now it's written out in /proc/self/maps.
>
> We can expose it nameless like other VMAs, but then it's 128k sized so it
> can be figured out. But when it's RO then it's not really a problem, even
> the kernel can't write to it.
Yeah, ok. I don't think we should hide it - if it's in the vma space it should be
listed in the 'maps' file, and with a descriptive name.
Thanks,
Ingo
> On Dec 8, 2017, at 1:34 AM, Thomas Gleixner <[email protected]> wrote:
>
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> * Andy Lutomirski <[email protected]> wrote:
>>> I don't love mucking with user address space. I'm also quite nervous about
>>> putting it in our near anything that could pass an access_ok check, since we're
>>> totally screwed if the bad guys can figure out how to write to it.
>>
>> Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>>
>> Can we have vmas with high addresses, in the vmalloc space for example?
>> IIRC the GPU code has precedents in that area.
>>
>> Since this is x86-64, limitation of the vmalloc() space is not an issue.
>>
>> I like Thomas's solution:
>>
>> - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
>> but with the system bit set.
>>
>> - That would be an advantage even for non-PTI kernels, because mmap() is probably
>> more randomized than kmalloc().
>
> Randomization is pointless as long as you can get the LDT address in user
> space, i.e. w/o UMIP.
You only get the LDT selector, not the address.
>
>> - It would also be a cleaner approach all around, and would avoid the fixmap
>> complications and the scheduler muckery.
>
> The error code of such an access is always 0x03. So I added a special
> handler, which checks whether the address is in the LDT map range and
> verifies that the access bit in the descriptor is 0. If that's the case it
> sets it and returns. If not, the thing dies. That works.
What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed segment? Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?
>
> Thanks,
>
> tglx
From: Andy Lutomirski
> Sent: 08 December 2017 13:20
...
> >> - It would also be a cleaner approach all around, and would avoid the fixmap
> >> complications and the scheduler muckery.
> >
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
>
> What if you are in kernel mode and try to return to a context with SS or CS pointing to a non-accessed
> segment?
> Or what if you try to schedule to a context with fs or, worse, gs pointing to such a segment?
Well, the cpu will fault in kernel on the 'pop %xs' or 'iret' instruction.
These all (probably) happen on the kernel stack with the usergs loaded.
So the fault handler has to look at the opcodes and/or %pc value, sort
out the stack (etc) and then generate SIGSEGV.
I'm not sure the kernel needs to know why the segment selector is invalid.
David
On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> >
> > The error code of such an access is always 0x03. So I added a special
> > handler, which checks whether the address is in the LDT map range and
> > verifies that the access bit in the descriptor is 0. If that's the case it
> > sets it and returns. If not, the thing dies. That works.
>
> What if you are in kernel mode and try to return to a context with SS
> or CS pointing to a non-accessed segment? Or what if you try to
> schedule to a context with fs or, worse, gs pointing to such a
> segment?
How would that be different from setting a 'crap' GS in modify_ldt() and
then returning from the syscall? That is something we should be able to
deal with already, no?
Is this something ldt_gdt.c already tests? The current "Test GS" is in
test_gdt_invalidation() which seems to suggest not.
Could we get a testcase for the exact situation you worry about? I'm not
sure I'd trust myself to get it right, all this LDT magic is new to me.
On Fri, Dec 08, 2017 at 03:06:54PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
> > >
> > > The error code of such an access is always 0x03. So I added a special
> > > handler, which checks whether the address is in the LDT map range and
> > > verifies that the access bit in the descriptor is 0. If that's the case it
> > > sets it and returns. If not, the thing dies. That works.
> >
> > What if you are in kernel mode and try to return to a context with SS
> > or CS pointing to a non-accessed segment? Or what if you try to
> > schedule to a context with fs or, worse, gs pointing to such a
> > segment?
>
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
>
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
>
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.
I ended up with the below; that loads something in LDT-2, sets GS to
LDT-2 and then again loads that same thing in LDT-2.
AFAIU calling modify_ldt will clear that ACCESSED bit, so this would
then trigger that on the return to user from modify_ldt, no?
Seems to work with tglx's patches.
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 66e5ce5b91f0..d46a620c3734 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -242,6 +242,36 @@ static void fail_install(struct user_desc *desc)
}
}
+static void do_ldt_gs_test(void)
+{
+ unsigned short prev_sel, sel = (2 << 3) | (1 << 2) | 3;
+
+ low_user_desc->entry_number = 2;
+
+ safe_modify_ldt(1, low_user_desc, sizeof(*low_user_desc));
+
+ /*
+ * syscall (eax) 123 - modify_ldt
+ * (ebx) - func
+ * (ecx) - ptr
+ * (edx) - bytecount
+ */
+
+ int eax = 123;
+ int ebx = 1;
+ int ecx = (unsigned int)low_user_desc;
+ int edx = sizeof(struct user_desc);
+
+ asm volatile ("movw %%gs, %[prev_sel]\n\t"
+ "movw %[sel], %%gs\n\t"
+ "int $0x80\n\t"
+ "mov %[prev_sel], %%gs"
+ : [prev_sel] "=&R" (prev_sel), [sel] "+R" (sel), "+a" (eax)
+ : "b" (ebx), "c" (ecx), "d" (edx)
+ : INT80_CLOBBERS);
+
+}
+
static void do_simple_tests(void)
{
struct user_desc desc = {
@@ -919,6 +946,8 @@ int main(int argc, char **argv)
setup_counter_page();
setup_low_user_desc();
+ do_ldt_gs_test();
+
do_simple_tests();
do_multicpu_tests();
On Fri, Dec 8, 2017 at 6:06 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 08, 2017 at 05:20:00AM -0800, Andy Lutomirski wrote:
>> >
>> > The error code of such an access is always 0x03. So I added a special
>> > handler, which checks whether the address is in the LDT map range and
>> > verifies that the access bit in the descriptor is 0. If that's the case it
>> > sets it and returns. If not, the thing dies. That works.
>>
>> What if you are in kernel mode and try to return to a context with SS
>> or CS pointing to a non-accessed segment? Or what if you try to
>> schedule to a context with fs or, worse, gs pointing to such a
>> segment?
>
> How would that be different from setting a 'crap' GS in modify_ldt() and
> then returning from the syscall? That is something we should be able to
> deal with already, no?
>
> Is this something ldt_gdt.c already tests? The current "Test GS" is in
> test_gdt_invalidation() which seems to suggest not.
>
> Could we get a testcase for the exact situation you worry about? I'm not
> sure I'd trust myself to get it right, all this LDT magic is new to me.
#GP on IRET is a failure, and we have disgusting code to handle it.
#PF on IRET would not be a failure -- it's a case where IRET should be
retried. Our crap that fixes up #GP would get that wrong and leave us
with the wrong GSBASE.
On Fri, Dec 8, 2017 at 3:31 AM, Ingo Molnar <[email protected]> wrote:
>
> * Thomas Gleixner <[email protected]> wrote:
>
>> On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > * Thomas Gleixner <[email protected]> wrote:
>> >
>> > > On Fri, 8 Dec 2017, Ingo Molnar wrote:
>> > > > * Andy Lutomirski <[email protected]> wrote:
>> > > > > I don't love mucking with user address space. I'm also quite nervous about
>> > > > > putting it in our near anything that could pass an access_ok check, since we're
>> > > > > totally screwed if the bad guys can figure out how to write to it.
>> > > >
>> > > > Hm, robustness of the LDT address wrt. access_ok() is a valid concern.
>> > > >
>> > > > Can we have vmas with high addresses, in the vmalloc space for example?
>> > > > IIRC the GPU code has precedents in that area.
>> > > >
>> > > > Since this is x86-64, limitation of the vmalloc() space is not an issue.
>> > > >
>> > > > I like Thomas's solution:
>> > > >
>> > > > - have the LDT in a regular mmap space vma (hence per process ASLR randomized),
>> > > > but with the system bit set.
>> > > >
>> > > > - That would be an advantage even for non-PTI kernels, because mmap() is probably
>> > > > more randomized than kmalloc().
>> > >
>> > > Randomization is pointless as long as you can get the LDT address in user
>> > > space, i.e. w/o UMIP.
>> >
>> > But with UMIP unprivileged user-space won't be able to get the linear address of
>> > the LDT. Now it's written out in /proc/self/maps.
>>
>> We can expose it nameless like other VMAs, but then it's 128k sized so it
>> can be figured out. But when it's RO then it's not really a problem, even
>> the kernel can't write to it.
>
> Yeah, ok. I don't think we should hide it - if it's in the vma space it should be
> listed in the 'maps' file, and with a descriptive name.
>
> Thanks,
>
> Ingo
Can we take a step back here? I think there are four vaguely sane
ways to make the LDT work:
1. The way it is right now -- in vmalloc space. The only real
downside is that it requires exposing that part of vmalloc space in
the user tables, which is a bit gross.
2. In some fixmap-like space, which is what my patch does, albeit
buggily. This requires a PGD that we treat as per-mm, not per-cpu,
but that's not so bad.
3. In one of the user PGDs but above TASK_SIZE_MAX. This is
functionally almost identical to #2, except that there's more concern
about exploits that write past TASK_SIZE_MAX.
4. In an actual vma. I don't see the benefit of doing this at all --
it's just like #2 except way more error prone. Hell, you have to make
sure that you can't munmap or mremap it, which isn't a consideration
at all with the other choices.
Why all the effort to make #4 work? #1 is working fine right now, and
#2 is half-implemented. #3 code-wise looks just like #2 except for
the choice of address and the interation with PTI's shitty PGD
handling.
From: Andy Lutomirski
> Sent: 08 December 2017 16:34
> #GP on IRET is a failure, and we have disgusting code to handle it.
Is that the trap in kernel space when the on-stack segment registers
are invalid?
Definitely needs horrid code...
> #PF on IRET would not be a failure -- it's a case where IRET should be
> retried. Our crap that fixes up #GP would get that wrong and leave us
> with the wrong GSBASE.
If the user code page isn't present then the fault happens after the
return to user mode, not on the IRET instruction in kernel mode.
So it is not really any different to returning to a NOP at the end
of a resident page when the page following is absent.
(Or any other invalid %ip value.)
SWAPGS is a PITA, should have been SAVEGS, LOAD_KERNEL_GS, and READ_SAVED_GS.
David
On Fri, Dec 8, 2017 at 8:46 AM, David Laight <[email protected]> wrote:
> From: Andy Lutomirski
>> Sent: 08 December 2017 16:34
>
>> #GP on IRET is a failure, and we have disgusting code to handle it.
>
> Is that the trap in kernel space when the on-stack segment registers
> are invalid?
> Definitely needs horrid code...
>
>> #PF on IRET would not be a failure -- it's a case where IRET should be
>> retried. Our crap that fixes up #GP would get that wrong and leave us
>> with the wrong GSBASE.
>
> If the user code page isn't present then the fault happens after the
> return to user mode, not on the IRET instruction in kernel mode.
> So it is not really any different to returning to a NOP at the end
> of a resident page when the page following is absent.
> (Or any other invalid %ip value.)
I mean: if the user CS or SS is not accessed and the LDT is RO, then
we get #PF on the IRET instruction, I think. Dealing with that is
truly awful.
From: Andy Lutomirski
> Sent: 08 December 2017 16:48
...
> I mean: if the user CS or SS is not accessed and the LDT is RO, then
> we get #PF on the IRET instruction, I think. Dealing with that is
> truly awful.
Any fault in-kernel on the IRET is horrid.
Doesn't really matter which one.
Same goes for the 'pop %ds' (etc) that tend to precede it.
David
On Fri, 8 Dec 2017, Andy Lutomirski wrote:
> Can we take a step back here? I think there are four vaguely sane
> ways to make the LDT work:
>
> 1. The way it is right now -- in vmalloc space. The only real
> downside is that it requires exposing that part of vmalloc space in
> the user tables, which is a bit gross.
>
> 2. In some fixmap-like space, which is what my patch does, albeit
> buggily. This requires a PGD that we treat as per-mm, not per-cpu,
> but that's not so bad.
>
> 3. In one of the user PGDs but above TASK_SIZE_MAX. This is
> functionally almost identical to #2, except that there's more concern
> about exploits that write past TASK_SIZE_MAX.
>
> 4. In an actual vma. I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone. Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.
Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
die. You get what you asked for.
Thanks,
tglx
On Fri, Dec 8, 2017 at 9:37 AM, Thomas Gleixner <[email protected]> wrote:
> On Fri, 8 Dec 2017, Andy Lutomirski wrote:
>> Can we take a step back here? I think there are four vaguely sane
>> ways to make the LDT work:
>>
>> 1. The way it is right now -- in vmalloc space. The only real
>> downside is that it requires exposing that part of vmalloc space in
>> the user tables, which is a bit gross.
>>
>> 2. In some fixmap-like space, which is what my patch does, albeit
>> buggily. This requires a PGD that we treat as per-mm, not per-cpu,
>> but that's not so bad.
>>
>> 3. In one of the user PGDs but above TASK_SIZE_MAX. This is
>> functionally almost identical to #2, except that there's more concern
>> about exploits that write past TASK_SIZE_MAX.
>>
>> 4. In an actual vma. I don't see the benefit of doing this at all --
>> it's just like #2 except way more error prone. Hell, you have to make
>> sure that you can't munmap or mremap it, which isn't a consideration
>> at all with the other choices.
>
> Why? You can unmap vdso or uprobe or whatever VMAs and you will simply
> die. You get what you asked for.
But if you unmap ldt and then map something else at the same VA, you're root.
On Fri, Dec 08, 2017 at 08:38:26AM -0800, Andy Lutomirski wrote:
> 4. In an actual vma. I don't see the benefit of doing this at all --
> it's just like #2 except way more error prone. Hell, you have to make
> sure that you can't munmap or mremap it, which isn't a consideration
> at all with the other choices.
mremap is trivially disabled. I've not tried munmap() yet, as long as it
just kills the process doing it we're good of course. Otherwise we need
an extra callback in do_munmap() which isn't too hard.
> Why all the effort to make #4 work?
Seemed like a sensible approach; I really dislike wasting an entire pmd
or whatever on a feature 'nobody' ever uses anyway.
> #1 is working fine right now
doesn't work for pti in its current form.