2022-06-10 15:07:34

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv3 0/8] Linear Address Masking enabling

Linear Address Masking[1] (LAM) modifies the checking that is applied to
64-bit linear addresses, allowing software to use the untranslated
address bits for metadata.
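
For illustration, a minimal userspace sketch of what a LAM_U57-tagged
pointer could look like (bits 62:57 carry a software-chosen tag and are
ignored by the hardware on dereference once LAM_U57 is enabled for the
process; the helpers below are hypothetical and not part of the series):

#include <stdint.h>
#include <stdio.h>

/* LAM_U57: bits 62:57 of a user pointer are ignored on dereference. */
#define LAM_U57_SHIFT	57
#define LAM_U57_MASK	(0x3fULL << LAM_U57_SHIFT)

static void *tag_pointer(void *p, uint64_t tag)
{
	return (void *)(((uint64_t)p & ~LAM_U57_MASK) |
			((tag << LAM_U57_SHIFT) & LAM_U57_MASK));
}

static uint64_t pointer_tag(void *p)
{
	return ((uint64_t)p & LAM_U57_MASK) >> LAM_U57_SHIFT;
}

int main(void)
{
	int x = 42;
	int *p = tag_pointer(&x, 0x2a);

	printf("tag = 0x%llx\n", (unsigned long long)pointer_tag(p));
	/* With LAM_U57 enabled, *p dereferences the same memory as &x. */
	return 0;
}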

The patchset brings support for LAM for userspace addresses.

LAM_U48 enabling is controversial since it competes for bits with
5-level paging. Its enabling is isolated into an optional last patch that
can be applied at the maintainer's discretion.

Please review and consider applying.

v3:
- Rebased onto v5.19-rc1
- Per-process enabling;
- API overhaul (again);
- Avoid branches and costly computations in the fast path;
- LAM_U48 is in optional patch.
v2:
- Rebased onto v5.18-rc1
- New arch_prctl(2)-based API
- Expose status of LAM (or other thread features) in
/proc/$PID/arch_status

[1] ISE, Chapter 14.
https://software.intel.com/content/dam/develop/external/us/en/documents-tps/architecture-instruction-set-extensions-programming-reference.pdf

Kirill A. Shutemov (8):
x86/mm: Fix CR3_ADDR_MASK
x86: CPUID and CR3/CR4 flags for Linear Address Masking
mm: Pass down mm_struct to untagged_addr()
x86/mm: Handle LAM on context switch
x86/uaccess: Provide untagged_addr() and remove tags before address check
x86/mm: Provide ARCH_GET_UNTAG_MASK and ARCH_ENABLE_TAGGED_ADDR
x86: Expose untagging mask in /proc/$PID/arch_status
x86/mm: Extend LAM to support to LAM_U48

arch/arm64/include/asm/memory.h | 4 +-
arch/arm64/include/asm/signal.h | 2 +-
arch/arm64/include/asm/uaccess.h | 4 +-
arch/arm64/kernel/hw_breakpoint.c | 2 +-
arch/arm64/kernel/traps.c | 4 +-
arch/arm64/mm/fault.c | 10 +--
arch/sparc/include/asm/pgtable_64.h | 2 +-
arch/sparc/include/asm/uaccess_64.h | 2 +
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/elf.h | 3 +-
arch/x86/include/asm/mmu.h | 2 +
arch/x86/include/asm/mmu_context.h | 58 +++++++++++++++++
arch/x86/include/asm/processor-flags.h | 2 +-
arch/x86/include/asm/tlbflush.h | 3 +
arch/x86/include/asm/uaccess.h | 44 ++++++++++++-
arch/x86/include/uapi/asm/prctl.h | 3 +
arch/x86/include/uapi/asm/processor-flags.h | 6 ++
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/fpu/xstate.c | 47 --------------
arch/x86/kernel/proc.c | 50 +++++++++++++++
arch/x86/kernel/process.c | 3 +
arch/x86/kernel/process_64.c | 54 +++++++++++++++-
arch/x86/kernel/sys_x86_64.c | 5 +-
arch/x86/mm/hugetlbpage.c | 6 +-
arch/x86/mm/mmap.c | 9 ++-
arch/x86/mm/tlb.c | 62 ++++++++++++++-----
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
drivers/infiniband/hw/mlx4/mr.c | 2 +-
drivers/media/common/videobuf2/frame_vector.c | 2 +-
drivers/media/v4l2-core/videobuf-dma-contig.c | 2 +-
.../staging/media/atomisp/pci/hmm/hmm_bo.c | 2 +-
drivers/tee/tee_shm.c | 2 +-
drivers/vfio/vfio_iommu_type1.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/mm.h | 11 ----
include/linux/uaccess.h | 11 ++++
lib/strncpy_from_user.c | 2 +-
lib/strnlen_user.c | 2 +-
mm/gup.c | 6 +-
mm/madvise.c | 2 +-
mm/mempolicy.c | 6 +-
mm/migrate.c | 2 +-
mm/mincore.c | 2 +-
mm/mlock.c | 4 +-
mm/mmap.c | 2 +-
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/msync.c | 2 +-
virt/kvm/kvm_main.c | 2 +-
51 files changed, 342 insertions(+), 126 deletions(-)
create mode 100644 arch/x86/kernel/proc.c

--
2.35.1


2022-06-10 15:08:27

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv3 2/8] x86: CPUID and CR3/CR4 flags for Linear Address Masking

Enumerate Linear Address Masking and provide defines for CR3 and CR4
flags.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/uapi/asm/processor-flags.h | 6 ++++++
2 files changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 393f2bbb5e3a..9835ab09b590 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -300,6 +300,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */

/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index c47cc7f2feeb..d898432947ff 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -82,6 +82,10 @@
#define X86_CR3_PCID_BITS 12
#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))

+#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */
+#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT)
+#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */
+#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT)
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)

@@ -132,6 +136,8 @@
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */
#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
+#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */
+#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT)

/*
* x86-64 Task Priority Register, CR8
--
2.35.1

2022-06-10 15:09:54

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

Linear Address Masking mode for userspace pointers is encoded in CR3 bits.
The mode is selected per-thread. Add new thread features to indicate that
the thread has Linear Address Masking enabled.

switch_mm_irqs_off() now respects these flags and constructs CR3
accordingly.

The active LAM mode gets recorded in the tlb_state.
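
For reference, a comment-style sketch of the resulting CR3 layout (LAM bit
positions are taken from patch 2/8; the pgd/PCID fields are the usual
x86-64 ones):

/*
 * Sketch of the CR3 bits involved:
 *
 *   bit  63      X86_CR3_PCID_NOFLUSH  (preserve old PCID, skip TLB flush)
 *   bit  62      X86_CR3_LAM_U48       (mask bits 62:48 of user pointers)
 *   bit  61      X86_CR3_LAM_U57       (mask bits 62:57 of user pointers)
 *   bits 51:12   physical address of the page-table root (pgd)
 *   bits 11:0    PCID (when CR4.PCIDE is set)
 *
 * so build_cr3() conceptually becomes:
 *
 *   cr3 = __sme_pa(pgd) | kern_pcid(asid) | lam;
 */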

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/mmu.h | 1 +
arch/x86/include/asm/mmu_context.h | 24 ++++++++++++
arch/x86/include/asm/tlbflush.h | 3 ++
arch/x86/mm/tlb.c | 62 ++++++++++++++++++++++--------
4 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..d150e92163b6 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -40,6 +40,7 @@ typedef struct {

#ifdef CONFIG_X86_64
unsigned short flags;
+ u64 lam_cr3_mask;
#endif

struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00..e6eac047c728 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
}
#endif

+#ifdef CONFIG_X86_64
+static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
+{
+ return mm->context.lam_cr3_mask;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+}
+
+#else
+
+static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+}
+#endif
+
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

@@ -168,6 +191,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
+ dup_lam(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4af5579c7ef7..5b93dad93ff4 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -86,6 +86,9 @@ struct tlb_state {
unsigned long last_user_mm_spec;
};

+#ifdef CONFIG_X86_64
+ u64 lam_cr3_mask;
+#endif
u16 loaded_mm_asid;
u16 next_asid;

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d400b6d9d246..458867a8f4bd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -154,17 +154,17 @@ static inline u16 user_pcid(u16 asid)
return ret;
}

-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, u64 lam)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
- return __sme_pa(pgd) | kern_pcid(asid);
+ return __sme_pa(pgd) | kern_pcid(asid) | lam;
} else {
VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(pgd);
+ return __sme_pa(pgd) | lam;
}
}

-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, u64 lam)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
@@ -173,7 +173,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
- return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ return __sme_pa(pgd) | kern_pcid(asid) | lam | CR3_NOFLUSH;
}

/*
@@ -274,15 +274,15 @@ static inline void invalidate_user_asid(u16 asid)
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}

-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, u64 lam, bool need_flush)
{
unsigned long new_mm_cr3;

if (need_flush) {
invalidate_user_asid(new_asid);
- new_mm_cr3 = build_cr3(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
} else {
- new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
}

/*
@@ -486,11 +486,36 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif

+#ifdef CONFIG_X86_64
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ return this_cpu_read(cpu_tlbstate.lam_cr3_mask);
+}
+
+static inline void set_tlbstate_lam_cr3_mask(u64 mask)
+{
+ this_cpu_write(cpu_tlbstate.lam_cr3_mask, mask);
+}
+
+#else
+
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ return 0;
+}
+
+static inline void set_tlbstate_lam_cr3_mask(u64 mask)
+{
+}
+#endif
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u64 prev_lam = tlbstate_lam_cr3_mask();
+ u64 new_lam = mm_cr3_lam_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
@@ -504,6 +529,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* cpu_tlbstate.loaded_mm) matches next.
*
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+ *
+ * NB: Initial LAM enabling calls us with prev == next. We must update
+ * CR3 if prev_lam doesn't match the new one.
*/

/* We don't want flush_tlb_func() to run concurrently with us. */
@@ -520,7 +548,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -551,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (real_prev == next && prev_lam == new_lam) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);

@@ -622,15 +650,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}

+ set_tlbstate_lam_cr3_mask(new_lam);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, true);

trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, false);

trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -687,6 +716,7 @@ void initialize_tlbstate_and_flush(void)
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
unsigned long cr3 = __read_cr3();
+ u64 lam = cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);

/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
@@ -700,7 +730,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));

/* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm->pgd, 0));
+ write_cr3(build_cr3(mm->pgd, 0, lam));

/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -1047,8 +1077,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
*/
unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
- this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ unsigned long cr3 =
+ build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+ tlbstate_lam_cr3_mask());

/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
--
2.35.1

2022-06-10 15:10:54

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

untagged_addr() is a helper used by the core-mm to strip tag bits and
get the address into canonical shape. It only handles userspace
addresses. The untagging mask is stored in mmu_context and will be set
on enabling LAM for the process.

The tags must not be included in the check of whether it's okay to access the
userspace address.

Strip tags in access_ok().

get_user() and put_user() don't use access_ok(), but check access
against TASK_SIZE directly in assembly. Strip tags before calling into
the assembly helper.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/mmu.h | 1 +
arch/x86/include/asm/mmu_context.h | 11 ++++++++
arch/x86/include/asm/uaccess.h | 44 ++++++++++++++++++++++++++++--
arch/x86/kernel/process.c | 3 ++
4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index d150e92163b6..59c617fc7c6f 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -41,6 +41,7 @@ typedef struct {
#ifdef CONFIG_X86_64
unsigned short flags;
u64 lam_cr3_mask;
+ u64 untag_mask;
#endif

struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index e6eac047c728..05821534aadc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -100,6 +100,12 @@ static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+ mm->context.untag_mask = oldmm->context.untag_mask;
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+ mm->context.untag_mask = -1UL;
}

#else
@@ -112,6 +118,10 @@ static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+}
#endif

#define enter_lazy_tlb enter_lazy_tlb
@@ -138,6 +148,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
+ mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35f222aa66bf..ca754521a4db 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,6 +6,7 @@
*/
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
+#include <linux/mm_types.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
@@ -20,6 +21,32 @@ static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ()
#endif

+#ifdef CONFIG_X86_64
+/*
+ * Mask out tag bits from the address.
+ *
+ * Magic with the 'sign' allows to untag userspace pointer without any branches
+ * while leaving kernel addresses intact.
+ */
+#define untagged_addr(mm, addr) ({ \
+ u64 __addr = (__force u64)(addr); \
+ s64 sign = (s64)__addr >> 63; \
+ __addr ^= sign; \
+ __addr &= (mm)->context.untag_mask; \
+ __addr ^= sign; \
+ (__force __typeof__(addr))__addr; \
+})
+
+#define untagged_ptr(mm, ptr) ({ \
+ u64 __ptrval = (__force u64)(ptr); \
+ __ptrval = untagged_addr(mm, __ptrval); \
+ (__force __typeof__(*(ptr)) *)__ptrval; \
+})
+#else
+#define untagged_addr(mm, addr) (addr)
+#define untagged_ptr(mm, ptr) (ptr)
+#endif
+
/**
* access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
@@ -40,7 +67,7 @@ static inline bool pagefault_disabled(void);
#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
- likely(__access_ok(addr, size)); \
+ likely(__access_ok(untagged_addr(current->mm, addr), size)); \
})

#include <asm-generic/access_ok.h>
@@ -125,7 +152,13 @@ extern int __get_user_bad(void);
* Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })
+#define get_user(x,ptr) \
+({ \
+ __typeof__(*(ptr)) __user *__ptr_clean; \
+ __ptr_clean = untagged_ptr(current->mm, ptr); \
+ might_fault(); \
+ do_get_user_call(get_user,x,__ptr_clean); \
+})

/**
* __get_user - Get a simple variable from user space, with less checking.
@@ -222,7 +255,12 @@ extern void __put_user_nocheck_8(void);
*
* Return: zero on success, or -EFAULT on error.
*/
-#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })
+#define put_user(x, ptr) ({ \
+ __typeof__(*(ptr)) __user *__ptr_clean; \
+ __ptr_clean = untagged_ptr(current->mm, ptr); \
+ might_fault(); \
+ do_put_user_call(put_user,x,__ptr_clean); \
+})

/**
* __put_user - Write a simple value into user space, with less checking.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9b2772b7e1f3..18b2bfdf7b9b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,6 +47,7 @@
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>
+#include <asm/mmu_context.h>

#include "process.h"

@@ -367,6 +368,8 @@ void arch_setup_new_exec(void)
task_clear_spec_ssb_noexec(current);
speculation_ctrl_update(read_thread_flags());
}
+
+ mm_reset_untag_mask(current->mm);
}

#ifdef CONFIG_X86_IOPL_IOPERM
--
2.35.1

2022-06-10 20:35:37

by Kostya Serebryany

[permalink] [raw]
Subject: Re: [PATCHv3 0/8] Linear Address Masking enabling

Thanks for working on this; please make LAM happen.
It enables efficient memory safety testing that is already available on AArch64.

Memory error detectors, such as ASAN and Valgrind (or KASAN for the kernel)
have limited applicability, primarily because of their run-time overheads
(CPU, RAM, and code size). In many cases, the major obstacle to a wider
deployment is the RAM overhead, which is typically 2x-3x. There is another
tool, HWASAN [1], which solves the same problem and has < 10% RAM overhead.
This tool is available only on AArch64 because it relies on the
top-byte-ignore (TBI) feature. Full support for that feature [2] has been
added to the kernel in order to enable HWASAN. Adding support for LAM will
enable HWASAN on x86_64.

HWASAN is already the main memory safety tool for Android [3] - the reduced
RAM overhead allowed us to utilize this testing tool where ASAN’s RAM
overhead was prohibitive. We have also prototyped the x86_64 variant of
HWASAN, and we can observe that it is a major improvement over ASAN. The
kernel support and hardware availability are the only missing parts.

Making HWASAN available on x86_64 will enable developers of server and
client software to scale up their memory safety testing, and thus improve
the quality and security of their products.


[1] https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
[2] https://www.kernel.org/doc/html/latest/arm64/tagged-address-abi.html
[3] https://source.android.com/devices/tech/debug/hwasan

--kcc


2022-06-11 00:19:30

by Edgecombe, Rick P

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> @@ -687,6 +716,7 @@ void initialize_tlbstate_and_flush(void)
> struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
> u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
> unsigned long cr3 = __read_cr3();
> + u64 lam = cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
>
> /* Assert that CR3 already references the right mm. */
> WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
> @@ -700,7 +730,7 @@ void initialize_tlbstate_and_flush(void)
> !(cr4_read_shadow() & X86_CR4_PCIDE));
>
> /* Force ASID 0 and force a TLB flush. */
> - write_cr3(build_cr3(mm->pgd, 0));
> + write_cr3(build_cr3(mm->pgd, 0, lam));
>

Can you explain why to keep the lam bits that were in CR3 here? It
seems the worry is that some CR3 bits got changed and need to be set to
a known state. Why not take them from the MM?

Also, it warns if the cr3 pfn doesn't match the mm pgd; should it warn
if the cr3 lam bits don't match the MM's copy?

2022-06-13 19:15:57

by Edgecombe, Rick P

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> +#ifdef CONFIG_X86_64
> +/*
> + * Mask out tag bits from the address.
> + *
> + * Magic with the 'sign' allows to untag userspace pointer without
> any branches
> + * while leaving kernel addresses intact.

Trying to understand the magic part here. I guess how it works is, when
the high bit is set, it does the opposite of untagging the addresses by
setting the tag bits instead of clearing them. So:
- For proper canonical kernel addresses (with U57) it leaves them
intact since the tag bits were already set.
- For non-canonical kernel-half addresses, it fixes them up.
(0xeffffff000000840->0xfffffff000000840)
- For U48 and 5 level paging, it corrupts some normal kernel
addresses. (0xff90ffffffffffff->0xffffffffffffffff)

I just ported this to userspace and threw some addresses at it to see
what happened, so hopefully I got that right.
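
Something along those lines, as a minimal standalone sketch (the untag
masks are the obvious LAM_U57/LAM_U48 choices, not values taken from the
patches):

#include <stdint.h>
#include <stdio.h>

/* Untag masks: keep bit 63 and the low bits, clear only the tag bits. */
#define UNTAG_MASK_U57	(~0x7e00000000000000ULL)	/* clears bits 62:57 */
#define UNTAG_MASK_U48	(~0x7fff000000000000ULL)	/* clears bits 62:48 */

/* Userspace copy of the xor-based untagged_addr() from the patch. */
static uint64_t untag(uint64_t addr, uint64_t untag_mask)
{
	int64_t sign = (int64_t)addr >> 63;

	addr ^= sign;
	addr &= untag_mask;
	addr ^= sign;
	return addr;
}

int main(void)
{
	/* Canonical kernel address: left intact. */
	printf("%016llx\n", (unsigned long long)untag(0xffff888000000000ULL, UNTAG_MASK_U57));
	/* Non-canonical kernel-half address: "fixed up" to 0xfffffff000000840. */
	printf("%016llx\n", (unsigned long long)untag(0xeffffff000000840ULL, UNTAG_MASK_U57));
	/* Valid 5-level kernel address with the U48 mask: corrupted to all-ones. */
	printf("%016llx\n", (unsigned long long)untag(0xff90ffffffffffffULL, UNTAG_MASK_U48));
	return 0;
}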

Is this special kernel address handling only needed because
copy_to_kernel_nofault(), etc call the user helpers?

> + */
> +#define untagged_addr(mm, addr) ({ \
> + u64 __addr = (__force u64)(addr); \
> + s64 sign = (s64)__addr >> 63; \
> + __addr ^= sign; \
> + __addr &= (mm)->context.untag_mask; \
> + __addr ^= sign; \
> + (__force __typeof__(addr))__addr; \
> +})

2022-06-15 16:07:40

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Fri, Jun 10, 2022 at 11:55:02PM +0000, Edgecombe, Rick P wrote:
> On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > @@ -687,6 +716,7 @@ void initialize_tlbstate_and_flush(void)
> > struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
> > u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
> > unsigned long cr3 = __read_cr3();
> > + u64 lam = cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
> >
> > /* Assert that CR3 already references the right mm. */
> > WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
> > @@ -700,7 +730,7 @@ void initialize_tlbstate_and_flush(void)
> > !(cr4_read_shadow() & X86_CR4_PCIDE));
> >
> > /* Force ASID 0 and force a TLB flush. */
> > - write_cr3(build_cr3(mm->pgd, 0));
> > + write_cr3(build_cr3(mm->pgd, 0, lam));
> >
>
> Can you explain why to keep the lam bits that were in CR3 here? It
> seems to be worried some CR3 bits got changed and need to be set to a
> known state. Why not take them from the MM?
>
> Also, it warns if the cr3 pfn doesn't match the mm pgd, should it warn
> if cr3 lam bits don't match the MM's copy?

You are right, taking LAM mode from init_mm is more correct. And we need
to update tlbstate with the new LAM mode.

I think both CR3 and init_mm should have LAM disabled here as we are
bringing the CPU up. I'll add WARN_ON().

--
Kirill A. Shutemov

2022-06-15 17:03:55

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Mon, Jun 13, 2022 at 05:36:43PM +0000, Edgecombe, Rick P wrote:
> On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > +#ifdef CONFIG_X86_64
> > +/*
> > + * Mask out tag bits from the address.
> > + *
> > + * Magic with the 'sign' allows to untag userspace pointer without
> > any branches
> > + * while leaving kernel addresses intact.
>
> Trying to understand the magic part here. I guess how it works is, when
> the high bit is set, it does the opposite of untagging the addresses by
> setting the tag bits instead of clearing them. So:
> - For proper canonical kernel addresses (with U57) it leaves them
> intact since the tag bits were already set.
> - For non-canonical kernel-half addresses, it fixes them up.
> (0xeffffff000000840->0xfffffff000000840)
> - For U48 and 5 level paging, it corrupts some normal kernel
> addresses. (0xff90ffffffffffff->0xffffffffffffffff)
>
> I just ported this to userspace and threw some addresses at it to see
> what happened, so hopefully I got that right.

Ouch. Thanks for noticing this. I should have caught this myself. Yes,
this implementation is broken for LAM_U48 on a 5-level machine.

What about this:

#define untagged_addr(mm, addr) ({ \
u64 __addr = (__force u64)(addr); \
s64 sign = (s64)__addr >> 63; \
__addr &= (mm)->context.untag_mask | sign; \
(__force __typeof__(addr))__addr; \
})

It effectively makes the mask all-ones for supervisor addresses. And it is
less magic to my eyes.
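
For illustration, running the same kind of userspace check on this variant
(standalone sketch with a hard-coded LAM_U48-style mask, not kernel code)
shows kernel addresses passing through untouched:

#include <stdint.h>

/* Standalone check of the or-based variant. */
static uint64_t untag_v2(uint64_t addr)
{
	uint64_t untag_mask = ~0x7fff000000000000ULL;	/* keep bit 63 and bits 47:0 */
	int64_t sign = (int64_t)addr >> 63;

	return addr & (untag_mask | sign);
}

/*
 * untag_v2(0xff90ffffffffffff) == 0xff90ffffffffffff
 *	(5-level kernel address left intact, unlike the xor-based version)
 * untag_v2(0x0012345600000840) == 0x0000345600000840
 *	(the tag in bits 62:48 of a user pointer is still cleared)
 */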

The generated code also looks sane to me:

11d0: 48 89 f8 mov %rdi,%rax
11d3: 48 c1 f8 3f sar $0x3f,%rax
11d7: 48 0b 05 52 2e 00 00 or 0x2e52(%rip),%rax # 4030 <untag_mask>
11de: 48 21 f8 and %rdi,%rax

Any comments?

> Is this special kernel address handling only needed because
> copy_to_kernel_nofault(), etc call the user helpers?

I did not have any particular use-case in mind. But if some kernel
address gets there and bits get cleared, we will have a very hard-to-debug
bug.

--
Kirill A. Shutemov

2022-06-15 20:16:52

by Edgecombe, Rick P

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Wed, 2022-06-15 at 19:58 +0300, Kirill A. Shutemov wrote:
> On Mon, Jun 13, 2022 at 05:36:43PM +0000, Edgecombe, Rick P wrote:
> > On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > > +#ifdef CONFIG_X86_64
> > > +/*
> > > + * Mask out tag bits from the address.
> > > + *
> > > + * Magic with the 'sign' allows to untag userspace pointer
> > > without
> > > any branches
> > > + * while leaving kernel addresses intact.
> >
> > Trying to understand the magic part here. I guess how it works is, when
> > the high bit is set, it does the opposite of untagging the addresses by
> > setting the tag bits instead of clearing them. So:
> > - For proper canonical kernel addresses (with U57) it leaves them
> > intact since the tag bits were already set.
> > - For non-canonical kernel-half addresses, it fixes them up.
> > (0xeffffff000000840->0xfffffff000000840)
> > - For U48 and 5 level paging, it corrupts some normal kernel
> > addresses. (0xff90ffffffffffff->0xffffffffffffffff)
> >
> > I just ported this to userspace and threw some addresses at it to see
> > what happened, so hopefully I got that right.
>
> Ouch. Thanks for noticing this. I should have catched this myself.
> Yes,
> this implementation is broken for LAM_U48 on 5-level machine.
>
> What about this:
>
> > #define untagged_addr(mm, addr) ({ \
> > u64 __addr = (__force u64)(addr); \
> > s64 sign = (s64)__addr >> 63; \
> > __addr &= (mm)->context.untag_mask | sign; \
> > (__force __typeof__(addr))__addr; \
> > })
>
> It effectively makes the mask all-ones for supervisor addresses. And it
> is less magic to my eyes.

Yea, it seems to leave kernel-half addresses alone now, including
leaving non-canonical addresses non-canonical and 5-level addresses intact.

With the new bit math:
Reviewed-by: Rick Edgecombe <[email protected]>

>
> The generated code also looks sane to me:
>
> 11d0: 48 89 f8 mov %rdi,%rax
> 11d3: 48 c1 f8 3f sar $0x3f,%rax
> 11d7: 48 0b 05 52 2e 00 00 or 0x2e52(%rip),%rax # 4030 <untag_mask>
> 11de: 48 21 f8 and %rdi,%rax
>
> Any comments?
>
> > Is this special kernel address handling only needed because
> > copy_to_kernel_nofault(), etc call the user helpers?
>
> I did not have any particular use-case in mind. But if some kernel
> address gets there and bits get cleared, we will have a very hard-to-debug
> bug.

I was just thinking that if we could rearrange the code to avoid untagging
kernel addresses, we could skip this, or even VM_WARN_ON() if we see
one. Seems ok either way.

2022-06-16 09:16:21

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Fri, Jun 10, 2022 at 05:35:23PM +0300, Kirill A. Shutemov wrote:

> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 4af5579c7ef7..5b93dad93ff4 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -86,6 +86,9 @@ struct tlb_state {
> unsigned long last_user_mm_spec;
> };
>
> +#ifdef CONFIG_X86_64
> + u64 lam_cr3_mask;
> +#endif
> u16 loaded_mm_asid;
> u16 next_asid;
>

Urgh.. so there's a comment there that states:

/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6

And then look at tlb_state:

struct tlb_state {
struct mm_struct * loaded_mm; /* 0 8 */
union {
struct mm_struct * last_user_mm; /* 8 8 */
long unsigned int last_user_mm_spec; /* 8 8 */
}; /* 8 8 */
u16 loaded_mm_asid; /* 16 2 */
u16 next_asid; /* 18 2 */
bool invalidate_other; /* 20 1 */

/* XXX 1 byte hole, try to pack */

short unsigned int user_pcid_flush_mask; /* 22 2 */
long unsigned int cr4; /* 24 8 */
struct tlb_context ctxs[6]; /* 32 96 */

/* size: 128, cachelines: 2, members: 8 */
/* sum members: 127, holes: 1, sum holes: 1 */
};

If you add that u64 as you do, you'll wreck all that.

Either use that one spare byte, or find room elsewhere I suppose.

2022-06-16 09:49:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Mon, Jun 13, 2022 at 05:36:43PM +0000, Edgecombe, Rick P wrote:

> Is this special kernel address handling only needed because
> copy_to_kernel_nofault(), etc call the user helpers?

It is to make absolutely sure we don't need to go audit everything, if
code is correct without untag_pointer() it will still be correct with it
on.

Also avoids future bugs due to being robust in general.

2022-06-16 10:04:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Mon, Jun 13, 2022 at 05:36:43PM +0000, Edgecombe, Rick P wrote:
> On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > +#ifdef CONFIG_X86_64
> > +/*
> > + * Mask out tag bits from the address.
> > + *
> > + * Magic with the 'sign' allows to untag userspace pointer without
> > any branches
> > + * while leaving kernel addresses intact.
>
> Trying to understand the magic part here. I guess how it works is, when
> the high bit is set, it does the opposite of untagging the addresses by
> setting the tag bits instead of clearing them. So:

The magic is really rather simple to see; there's two observations:

x ^ y ^ y == x

That is, xor is its own inverse. And secondly, xor with 1 is a bit
toggle.

So if we mask a negative value, we destroy the sign. Therefore, if we
xor with the sign-bit, we have a nop for positive numbers and a toggle
for negatives (effectively making them positive, -1, 2s complement
yada-yada). Then we can mask without fear of destroying the sign, and
then we xor again to undo whatever we did before, effectively restoring
the sign.

Anyway, the consequence of all this is that LAM_U48 won't work correctly
on 5-level kernels, because the mask will still destroy kernel pointers.

As such, this patch only does LAM_U57.


2022-06-16 10:26:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Fri, Jun 10, 2022 at 05:35:24PM +0300, Kirill A. Shutemov wrote:
> +#ifdef CONFIG_X86_64
> +/*
> + * Mask out tag bits from the address.
> + *
> + * Magic with the 'sign' allows to untag userspace pointer without any branches
> + * while leaving kernel addresses intact.
> + */
> +#define untagged_addr(mm, addr) ({ \
> + u64 __addr = (__force u64)(addr); \
> + s64 sign = (s64)__addr >> 63; \
> + __addr ^= sign; \
> + __addr &= (mm)->context.untag_mask; \
> + __addr ^= sign; \
> + (__force __typeof__(addr))__addr; \
> +})

Can't we make that mask a constant and *always* unmask U57 irrespective
of LAM being on?

2022-06-16 16:59:19

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Thu, Jun 16, 2022 at 11:30:49AM +0200, Peter Zijlstra wrote:
> On Mon, Jun 13, 2022 at 05:36:43PM +0000, Edgecombe, Rick P wrote:
> > On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > > +#ifdef CONFIG_X86_64
> > > +/*
> > > + * Mask out tag bits from the address.
> > > + *
> > > + * Magic with the 'sign' allows to untag userspace pointer without
> > > any branches
> > > + * while leaving kernel addresses intact.
> >
> > Trying to understand the magic part here. I guess how it works is, when
> > the high bit is set, it does the opposite of untagging the addresses by
> > setting the tag bits instead of clearing them. So:
>
> The magic is really rather simple to see; there's two observations:
>
> x ^ y ^ y == x
>
> That is; xor is it's own inverse. And secondly, xor with 1 is a bit
> toggle.
>
> So if we mask a negative value, we destroy the sign. Therefore, if we
> xor with the sign-bit, we have a nop for positive numbers and a toggle
> for negatives (effectively making them positive, -1, 2s complement
> yada-yada) then we can mask, without fear of destroying the sign, and
> then we xor again to undo whatever we did before, effectively restoring
> the sign.
>
> Anyway, concequence of all this is that LAM_U48 won't work correct on
> 5-level kernels, because the mask will still destroy kernel pointers.

Any objection against this variant (was posted in the thread):

#define untagged_addr(mm, addr) ({ \
u64 __addr = (__force u64)(addr); \
s64 sign = (s64)__addr >> 63; \
__addr &= (mm)->context.untag_mask | sign; \
(__force __typeof__(addr))__addr; \
})

?

I find it easier to follow and it is LAM_U48-safe.

--
Kirill A. Shutemov

2022-06-16 17:00:31

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Thu, Jun 16, 2022 at 11:08:07AM +0200, Peter Zijlstra wrote:
> Either use that one spare byte, or find room elsewhere I suppose.

Okay, I will put it into the byte after invalidate_other and modify
tlbstate_lam_cr3_mask() and set_tlbstate_lam_cr3_mask() to shift it
accordingly.

It looks like this:

struct tlb_state {
struct mm_struct * loaded_mm; /* 0 8 */
union {
struct mm_struct * last_user_mm; /* 8 8 */
unsigned long last_user_mm_spec; /* 8 8 */
}; /* 8 8 */
u16 loaded_mm_asid; /* 16 2 */
u16 next_asid; /* 18 2 */
bool invalidate_other; /* 20 1 */
u8 lam; /* 21 1 */
unsigned short user_pcid_flush_mask; /* 22 2 */
unsigned long cr4; /* 24 8 */
struct tlb_context ctxs[6]; /* 32 96 */

/* size: 128, cachelines: 2, members: 9 */
};
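
The shifted accessors could then look roughly like this (a sketch, reusing
X86_CR3_LAM_U57_BIT from patch 2/8 as the shift):

/* Pack only CR3 bits 62:61 into the u8, shifted down to bits 1:0. */
static inline u64 tlbstate_lam_cr3_mask(void)
{
	u64 lam = this_cpu_read(cpu_tlbstate.lam);

	return lam << X86_CR3_LAM_U57_BIT;
}

static inline void set_tlbstate_lam_cr3_mask(u64 mask)
{
	this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
}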

--
Kirill A. Shutemov

2022-06-16 17:00:41

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Thu, Jun 16, 2022 at 12:02:16PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 10, 2022 at 05:35:24PM +0300, Kirill A. Shutemov wrote:
> > +#ifdef CONFIG_X86_64
> > +/*
> > + * Mask out tag bits from the address.
> > + *
> > + * Magic with the 'sign' allows to untag userspace pointer without any branches
> > + * while leaving kernel addresses intact.
> > + */
> > +#define untagged_addr(mm, addr) ({ \
> > + u64 __addr = (__force u64)(addr); \
> > + s64 sign = (s64)__addr >> 63; \
> > + __addr ^= sign; \
> > + __addr &= (mm)->context.untag_mask; \
> > + __addr ^= sign; \
> > + (__force __typeof__(addr))__addr; \
> > +})
>
> Can't we make that mask a constant and *always* unmask U57 irrespective
> of LAM being on?

We can do this if we give up on LAM_U48.

It would also needlessly relax the canonical check. I'm not sure it is a good
idea.

--
Kirill A. Shutemov

2022-06-16 23:18:57

by Edgecombe, Rick P

[permalink] [raw]
Subject: Re: [PATCHv3 0/8] Linear Address Masking enabling

On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> Linear Address Masking[1] (LAM) modifies the checking that is applied
> to
> 64-bit linear addresses, allowing software to use of the untranslated
> address bits for metadata.
>
> The patchset brings support for LAM for userspace addresses.

Arm has this documentation about which memory operations support being
passed tagged pointers, and which do not:
Documentation/arm64/tagged-address-abi.rst

Is the idea that LAM would have something similar, or exactly mirror
the arm ABI? It seems like it is the same right now. Should the docs be
generalized?

2022-06-16 23:54:47

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 0/8] Linear Address Masking enabling

On Thu, Jun 16, 2022 at 10:52:14PM +0000, Edgecombe, Rick P wrote:
> On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > Linear Address Masking[1] (LAM) modifies the checking that is applied
> > to
> > 64-bit linear addresses, allowing software to use of the untranslated
> > address bits for metadata.
> >
> > The patchset brings support for LAM for userspace addresses.
>
> Arm has this documentation about which memory operations support being
> passed tagged pointers, and which do not:
> Documentation/arm64/tagged-address-abi.rst
>
> Is the idea that LAM would have something similar, or exactly mirror
> the arm ABI? It seems like it is the same right now. Should the docs be
> generalized?

It is somewhat similar, but not exact. The ARM TBI interface implies the tag
size and placement. ARM TBI is per-thread and LAM is per-process.

--
Kirill A. Shutemov

2022-06-17 00:05:33

by Edgecombe, Rick P

[permalink] [raw]
Subject: Re: [PATCHv3 0/8] Linear Address Masking enabling

On Fri, 2022-06-17 at 02:43 +0300, Kirill A. Shutemov wrote:
> On Thu, Jun 16, 2022 at 10:52:14PM +0000, Edgecombe, Rick P wrote:
> > On Fri, 2022-06-10 at 17:35 +0300, Kirill A. Shutemov wrote:
> > > Linear Address Masking[1] (LAM) modifies the checking that is
> > > applied
> > > to
> > > 64-bit linear addresses, allowing software to use of the
> > > untranslated
> > > address bits for metadata.
> > >
> > > The patchset brings support for LAM for userspace addresses.
> >
> > Arm has this documentation about which memory operations support
> > being
> > passed tagged pointers, and which do not:
> > Documentation/arm64/tagged-address-abi.rst
> >
> > Is the idea that LAM would have something similar, or exactly
> > mirror
> > the arm ABI? It seems like it is the same right now. Should the
> > docs be
> > generalized?
>
> It is somewhat similar, but not exact. ARM TBI interface implies tag
> size
> and placement. ARM TBI is per-thread and LAM is per-process.

Ah right. I was thinking more of the part about which syscalls support
tagged addresses:

https://www.kernel.org/doc/html/latest/arm64/tagged-address-abi.html#id1

Some mention kernel versions where they changed. Just thinking it could
get complex to track which HW features support which syscalls for which
kernel versions.

2022-06-17 11:47:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Thu, Jun 16, 2022 at 07:44:40PM +0300, Kirill A. Shutemov wrote:
> Any objection against this variant (was posted in the thread):
>
> #define untagged_addr(mm, addr) ({ \
> u64 __addr = (__force u64)(addr); \
> s64 sign = (s64)__addr >> 63; \
> __addr &= (mm)->context.untag_mask | sign; \
> (__force __typeof__(addr))__addr; \
> })
>
> ?

Yeah, I suppose that should work fine.

2022-06-17 14:43:09

by H.J. Lu

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Fri, Jun 17, 2022 at 4:36 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 16, 2022 at 07:44:40PM +0300, Kirill A. Shutemov wrote:
> > Any objection against this variant (was posted in the thread):
> >
> > #define untagged_addr(mm, addr) ({ \
> > u64 __addr = (__force u64)(addr); \
> > s64 sign = (s64)__addr >> 63; \
> > __addr &= (mm)->context.untag_mask | sign; \
> > (__force __typeof__(addr))__addr; \
> > })
> >
> > ?
>
> Yeah, I suppose that should work fine.

Won't the sign bit be put at the wrong place?

--
H.J.

2022-06-17 14:45:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Fri, Jun 17, 2022 at 07:22:57AM -0700, H.J. Lu wrote:
> On Fri, Jun 17, 2022 at 4:36 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Thu, Jun 16, 2022 at 07:44:40PM +0300, Kirill A. Shutemov wrote:
> > > Any objection against this variant (was posted in the thread):
> > >
> > > #define untagged_addr(mm, addr) ({ \
> > > u64 __addr = (__force u64)(addr); \
> > > s64 sign = (s64)__addr >> 63; \
> > > __addr &= (mm)->context.untag_mask | sign; \
> > > (__force __typeof__(addr))__addr; \
> > > })
> > >
> > > ?
> >
> > Yeah, I suppose that should work fine.
>
> Won't the sign bit be put at the wrong place?

sign is either 0 or ~0; by or-ing ~0 into the mask, the masking becomes
a no-op.
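
A worked example, assuming a LAM_U57-style untag_mask that clears bits
62:57:

/*
 *   tagged user pointer 0x1200001234560000:  sign = 0
 *     addr & (untag_mask | 0)   == 0x0000001234560000  (tag stripped)
 *
 *   kernel pointer 0xffff888000000000:       sign = ~0
 *     addr & (untag_mask | ~0)  == 0xffff888000000000  (left untouched)
 */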

2022-06-17 16:18:58

by Alexander Potapenko

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Fri, Jun 10, 2022 at 4:35 PM Kirill A. Shutemov
<[email protected]> wrote:
>
> Linear Address Masking mode for userspace pointers encoded in CR3 bits.
> The mode is selected per-thread. Add new thread features indicate that the
> thread has Linear Address Masking enabled.
>
> switch_mm_irqs_off() now respects these flags and constructs CR3
> accordingly.
>
> The active LAM mode gets recorded in the tlb_state.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>


> +#ifdef CONFIG_X86_64
> +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> +{
> + return mm->context.lam_cr3_mask;
> +}

Nit: can we have either "cr3_lam_mask" or "lam_cr3_mask" in both places?



--
Alexander Potapenko
Software Engineer

Google Germany GmbH

2022-06-17 22:48:22

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Fri, Jun 17, 2022 at 05:35:05PM +0200, Alexander Potapenko wrote:
> On Fri, Jun 10, 2022 at 4:35 PM Kirill A. Shutemov
> <[email protected]> wrote:
> >
> > Linear Address Masking mode for userspace pointers encoded in CR3 bits.
> > The mode is selected per-thread. Add new thread features indicate that the
> > thread has Linear Address Masking enabled.
> >
> > switch_mm_irqs_off() now respects these flags and constructs CR3
> > accordingly.
> >
> > The active LAM mode gets recorded in the tlb_state.
> >
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
>
>
> > +#ifdef CONFIG_X86_64
> > +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> > +{
> > + return mm->context.lam_cr3_mask;
> > +}
>
> Nit: can we have either "cr3_lam_mask" or "lam_cr3_mask" in both places?

With the changes suggested by Peter, the field in the mmu_context will be
called 'lam' as it is not a CR3 mask anymore.

--
Kirill A. Shutemov

2022-06-28 23:40:08

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On 6/10/22 07:35, Kirill A. Shutemov wrote:
> Linear Address Masking mode for userspace pointers encoded in CR3 bits.
> The mode is selected per-thread. Add new thread features indicate that the
> thread has Linear Address Masking enabled.
>
> switch_mm_irqs_off() now respects these flags and constructs CR3
> accordingly.
>
> The active LAM mode gets recorded in the tlb_state.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> arch/x86/include/asm/mmu.h | 1 +
> arch/x86/include/asm/mmu_context.h | 24 ++++++++++++
> arch/x86/include/asm/tlbflush.h | 3 ++
> arch/x86/mm/tlb.c | 62 ++++++++++++++++++++++--------
> 4 files changed, 75 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 5d7494631ea9..d150e92163b6 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -40,6 +40,7 @@ typedef struct {
>
> #ifdef CONFIG_X86_64
> unsigned short flags;
> + u64 lam_cr3_mask;
> #endif
>
> struct mutex lock;
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index b8d40ddeab00..e6eac047c728 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
> }
> #endif
>
> +#ifdef CONFIG_X86_64
> +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> +{
> + return mm->context.lam_cr3_mask;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
> +}
> +
> +#else
> +
> +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +}
> +#endif

Do we really need the ifdeffery here? I see no real harm in having the
field exist on 32-bit -- we don't care much about performance for 32-bit
kernels.

> - if (real_prev == next) {
> + if (real_prev == next && prev_lam == new_lam) {
> VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> next->context.ctx_id);

This looks wrong to me. If we change threads within the same mm but lam
changes (which is certainly possible by a race if nothing else) then
this will go down the "we really are changing mms" path, not the "we're
not changing but we might need to flush something" path.

2022-06-29 00:12:55

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On 6/10/22 07:35, Kirill A. Shutemov wrote:
> untagged_addr() is a helper used by the core-mm to strip tag bits and
> get the address to the canonical shape. In only handles userspace
> addresses. The untagging mask is stored in mmu_context and will be set
> on enabling LAM for the process.
>
> The tags must not be included into check whether it's okay to access the
> userspace address.
>
> Strip tags in access_ok().

What is the intended behavior for an access that spans a tag boundary?

Also, at the risk of a potentially silly question, why do we need to
strip the tag before access_ok()? With LAM, every valid tagged user
address is also a valid untagged address, right? (There is no
particular need to enforce the actual value of TASK_SIZE_MAX on
*access*, just on mmap.)

IOW, wouldn't it be sufficient, and probably better than what we have
now, to just check that the entire range has the high bit clear?

--Andy

2022-06-29 01:09:18

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch

On Tue, Jun 28, 2022 at 04:33:21PM -0700, Andy Lutomirski wrote:
> On 6/10/22 07:35, Kirill A. Shutemov wrote:
> > Linear Address Masking mode for userspace pointers encoded in CR3 bits.
> > The mode is selected per-thread. Add new thread features indicate that the
> > thread has Linear Address Masking enabled.
> >
> > switch_mm_irqs_off() now respects these flags and constructs CR3
> > accordingly.
> >
> > The active LAM mode gets recorded in the tlb_state.
> >
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > ---
> > arch/x86/include/asm/mmu.h | 1 +
> > arch/x86/include/asm/mmu_context.h | 24 ++++++++++++
> > arch/x86/include/asm/tlbflush.h | 3 ++
> > arch/x86/mm/tlb.c | 62 ++++++++++++++++++++++--------
> > 4 files changed, 75 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> > index 5d7494631ea9..d150e92163b6 100644
> > --- a/arch/x86/include/asm/mmu.h
> > +++ b/arch/x86/include/asm/mmu.h
> > @@ -40,6 +40,7 @@ typedef struct {
> > #ifdef CONFIG_X86_64
> > unsigned short flags;
> > + u64 lam_cr3_mask;
> > #endif
> > struct mutex lock;
> > diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> > index b8d40ddeab00..e6eac047c728 100644
> > --- a/arch/x86/include/asm/mmu_context.h
> > +++ b/arch/x86/include/asm/mmu_context.h
> > @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
> > }
> > #endif
> > +#ifdef CONFIG_X86_64
> > +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> > +{
> > + return mm->context.lam_cr3_mask;
> > +}
> > +
> > +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> > +{
> > + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
> > +}
> > +
> > +#else
> > +
> > +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
> > +{
> > + return 0;
> > +}
> > +
> > +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> > +{
> > +}
> > +#endif
>
> Do we really need the ifdeffery here? I see no real harm in having the
> field exist on 32-bit -- we don't care much about performance for 32-bit
> kernels.

The waste doesn't feel right to me. I would rather keep it.

But sure I can do this if needed.

> > - if (real_prev == next) {
> > + if (real_prev == next && prev_lam == new_lam) {
> > VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
> > next->context.ctx_id);
>
> This looks wrong to me. If we change threads within the same mm but lam
> changes (which is certainly possible by a race if nothing else) then this
> will go down the "we really are changing mms" path, not the "we're not
> changing but we might need to flush something" path.

If LAM gets enabled we must write CR3 with the new LAM mode. Without the
change, the real_prev == next case will not do this in the !was_lazy case.

Note that currently enabling LAM is done by setting LAM mode in the mmu
context and doing switch_mm(current->mm, current->mm, current), so it is a
very important case.
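
Roughly, the enabling path amounts to the following (a sketch based on the
description above; patch 6/8 itself is not quoted in this thread and the
function name is approximate):

static void enable_lam_u57(struct mm_struct *mm)
{
	/* Select LAM_U57 for this mm: CR3 bit and matching untag mask. */
	mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
	mm->context.untag_mask = ~GENMASK_ULL(62, 57);

	/* Reload CR3 on this CPU so the new LAM mode takes effect. */
	switch_mm(current->mm, current->mm, current);
}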

--
Kirill A. Shutemov

2022-06-29 01:18:09

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Tue, Jun 28, 2022 at 04:40:48PM -0700, Andy Lutomirski wrote:
> On 6/10/22 07:35, Kirill A. Shutemov wrote:
> > untagged_addr() is a helper used by the core-mm to strip tag bits and
> > get the address to the canonical shape. In only handles userspace
> > addresses. The untagging mask is stored in mmu_context and will be set
> > on enabling LAM for the process.
> >
> > The tags must not be included into check whether it's okay to access the
> > userspace address.
> >
> > Strip tags in access_ok().
>
> What is the intended behavior for an access that spans a tag boundary?

You mean if the 'addr' passed to access_ok() is below the 47- or 56-bit
boundary but 'addr' + 'size' overflows into the tag bits? It is not a valid
access and the current implementation handles it correctly.

> Also, at the risk of a potentially silly question, why do we need to strip
> the tag before access_ok()? With LAM, every valid tagged user address is
> also a valid untagged address, right? (There is no particular need to
> enforce the actual value of TASK_SIZE_MAX on *access*, just on mmap.)
>
> IOW, wouldn't it be sufficient, and probably better than what we have now,
> to just check that the entire range has the high bit clear?

No. We do things to addresses on the kernel side beyond dereferencing them:
comparing addresses has to make sense, find_vma() has to find the relevant
mapping, and so on.
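
(A minimal sketch of that point; the function name is made up, and
untagged_addr() takes the mm argument added earlier in this series:)

#include <linux/mm.h>
#include <linux/uaccess.h>

/* Lookups and comparisons only make sense on the untagged address.
 * Caller must hold mmap_lock for the find_vma() call. */
static struct vm_area_struct *sketch_find_vma(struct mm_struct *mm,
                                              unsigned long tagged_addr)
{
        unsigned long addr = untagged_addr(mm, tagged_addr);

        /* With the tag bits still set, no VMA would ever match the address */
        return find_vma(mm, addr);
}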

--
Kirill A. Shutemov

2022-06-30 02:00:51

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCHv3 4/8] x86/mm: Handle LAM on context switch



On Tue, Jun 28, 2022, at 5:34 PM, Kirill A. Shutemov wrote:
> On Tue, Jun 28, 2022 at 04:33:21PM -0700, Andy Lutomirski wrote:
>> On 6/10/22 07:35, Kirill A. Shutemov wrote:
>> > Linear Address Masking mode for userspace pointers is encoded in CR3 bits.
>> > The mode is selected per-thread. Add new thread features to indicate that
>> > the thread has Linear Address Masking enabled.
>> >
>> > switch_mm_irqs_off() now respects these flags and constructs CR3
>> > accordingly.
>> >
>> > The active LAM mode gets recorded in the tlb_state.
>> >
>> > Signed-off-by: Kirill A. Shutemov <[email protected]>
>> > ---
>> > arch/x86/include/asm/mmu.h | 1 +
>> > arch/x86/include/asm/mmu_context.h | 24 ++++++++++++
>> > arch/x86/include/asm/tlbflush.h | 3 ++
>> > arch/x86/mm/tlb.c | 62 ++++++++++++++++++++++--------
>> > 4 files changed, 75 insertions(+), 15 deletions(-)
>> >
>> > diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
>> > index 5d7494631ea9..d150e92163b6 100644
>> > --- a/arch/x86/include/asm/mmu.h
>> > +++ b/arch/x86/include/asm/mmu.h
>> > @@ -40,6 +40,7 @@ typedef struct {
>> > #ifdef CONFIG_X86_64
>> > unsigned short flags;
>> > + u64 lam_cr3_mask;
>> > #endif
>> > struct mutex lock;
>> > diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>> > index b8d40ddeab00..e6eac047c728 100644
>> > --- a/arch/x86/include/asm/mmu_context.h
>> > +++ b/arch/x86/include/asm/mmu_context.h
>> > @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
>> > }
>> > #endif
>> > +#ifdef CONFIG_X86_64
>> > +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
>> > +{
>> > + return mm->context.lam_cr3_mask;
>> > +}
>> > +
>> > +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
>> > +{
>> > + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
>> > +}
>> > +
>> > +#else
>> > +
>> > +static inline u64 mm_cr3_lam_mask(struct mm_struct *mm)
>> > +{
>> > + return 0;
>> > +}
>> > +
>> > +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
>> > +{
>> > +}
>> > +#endif
>>
>> Do we really need the ifdeffery here? I see no real harm in having the
>> field exist on 32-bit -- we don't care much about performance for 32-bit
>> kernels.
>
> The waste doesn't feel right to me; I would rather keep the ifdeffery.
>
> But sure, I can drop it if needed.

I could go either way here.

>
>> > - if (real_prev == next) {
>> > + if (real_prev == next && prev_lam == new_lam) {
>> > VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
>> > next->context.ctx_id);
>>
>> This looks wrong to me. If we change threads within the same mm but lam
>> changes (which is certainly possible by a race if nothing else) then this
>> will go down the "we really are changing mms" path, not the "we're not
>> changing but we might need to flush something" path.
>
> If LAM gets enabled we must write CR3 with the new LAM mode. Without the
> change, the real_prev == next case will not do this in the !was_lazy case.

You could fix that. Or you could determine that this isn’t actually needed, just like updating the LDT in that path isn’t needed, if you change the way LAM is updated.

>
> Note that enabling LAM is currently done by setting the LAM mode in the mmu
> context and then doing switch_mm(current->mm, current->mm, current), so this
> is a very important case.
>

Well, I did separately ask why this is the case.

> --
> Kirill A. Shutemov

2022-06-30 03:24:23

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check



On Tue, Jun 28, 2022, at 5:42 PM, Kirill A. Shutemov wrote:
> On Tue, Jun 28, 2022 at 04:40:48PM -0700, Andy Lutomirski wrote:
>> On 6/10/22 07:35, Kirill A. Shutemov wrote:
>> > untagged_addr() is a helper used by the core-mm to strip tag bits and
>> > get the address into the canonical shape. It only handles userspace
>> > addresses. The untagging mask is stored in mmu_context and will be set
>> > on enabling LAM for the process.
>> >
>> > The tags must not be included in the check of whether it's okay to access
>> > the userspace address.
>> >
>> > Strip tags in access_ok().
>>
>> What is the intended behavior for an access that spans a tag boundary?
>
> You mean if the 'addr' passed to access_ok() is below the 47- or 56-bit
> boundary but 'addr' + 'size' overflows into the tag bits? That is not a
> valid access and the current implementation handles it correctly.
>
>> Also, at the risk of a potentially silly question, why do we need to strip
>> the tag before access_ok()? With LAM, every valid tagged user address is
>> also a valid untagged address, right? (There is no particular need to
>> enforce the actual value of TASK_SIZE_MAX on *access*, just on mmap.)
>>
>> IOW, wouldn't it be sufficient, and probably better than what we have now,
>> to just check that the entire range has the high bit clear?
>
> No. We do things to addresses on the kernel side beyond dereferencing them:
> comparing addresses has to make sense, find_vma() has to find the relevant
> mapping, and so on.

I think you’re misunderstanding me. Of course things like find_vma() need to untag the address. (But things like munmap, IMO, ought not accept tagged addresses.)

But I’m not talking about that at all. I’m asking why we need to untag an address for access_ok. In the bad old days, access_ok checked that a range was below a *variable* cutoff. But set_fs() is gone, and I don’t think this is needed any more.

With some off-the-cuff bit hackery, I think it ought to be sufficient to calculate addr+len and fail if the overflow or sign bits get set. If LAM is off, we could calculate addr | len and fail if either of the top two bits is set, but LAM may not let us get away with that. The general point being that, on x86 (as long as we ignore AMD’s LAM-like feature) an address is a user address if the top bit is clear. Whether that address is canonical or not or will translate or not is a separate issue. (And making this change would require allowing uaccess to #GP, which requires some care.)

What do you think?
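
(For illustration, the addr + len check described two paragraphs up boils
down to something like the following sketch; the helper name is made up and
this is not proposed kernel code:)

#include <stdbool.h>
#include <stdint.h>

/*
 * Accept the range only if addr + len does not wrap and neither the
 * start nor the end has the sign bit set, i.e. the whole range stays
 * in the lower (user) half of the 64-bit address space.
 */
static inline bool sketch_range_ok(uint64_t addr, uint64_t len)
{
        uint64_t end = addr + len;

        return end >= addr && !((addr | end) >> 63);
}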

>
> --
> Kirill A. Shutemov

2022-07-05 00:42:15

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv3 5/8] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Wed, Jun 29, 2022 at 07:38:20PM -0700, Andy Lutomirski wrote:
>
>
> On Tue, Jun 28, 2022, at 5:42 PM, Kirill A. Shutemov wrote:
> > On Tue, Jun 28, 2022 at 04:40:48PM -0700, Andy Lutomirski wrote:
> >> On 6/10/22 07:35, Kirill A. Shutemov wrote:
> >> > untagged_addr() is a helper used by the core-mm to strip tag bits and
> >> > get the address into the canonical shape. It only handles userspace
> >> > addresses. The untagging mask is stored in mmu_context and will be set
> >> > on enabling LAM for the process.
> >> >
> >> > The tags must not be included in the check of whether it's okay to access
> >> > the userspace address.
> >> >
> >> > Strip tags in access_ok().
> >>
> >> What is the intended behavior for an access that spans a tag boundary?
> >
> > You mean if the 'addr' passed to access_ok() is below the 47- or 56-bit
> > boundary but 'addr' + 'size' overflows into the tag bits? That is not a
> > valid access and the current implementation handles it correctly.
> >
> >> Also, at the risk of a potentially silly question, why do we need to strip
> >> the tag before access_ok()? With LAM, every valid tagged user address is
> >> also a valid untagged address, right? (There is no particular need to
> >> enforce the actual value of TASK_SIZE_MAX on *access*, just on mmap.)
> >>
> >> IOW, wouldn't it be sufficient, and probably better than what we have now,
> >> to just check that the entire range has the high bit clear?
> >
> > No. We do things to addresses on the kernel side beyond dereferencing them:
> > comparing addresses has to make sense, find_vma() has to find the relevant
> > mapping, and so on.
>
> I think you’re misunderstanding me. Of course things like find_vma()
> need to untag the address. (But things like munmap, IMO, ought not
> accept tagged addresses.)
>
> But I’m not talking about that at all. I’m asking why we need to untag
> an address for access_ok. In the bad old days, access_ok checked that a
> range was below a *variable* cutoff. But set_fs() is gone, and I don’t
> think this is needed any more.
>
> With some off-the-cuff bit hackery, I think it ought to be sufficient to
> calculate addr+len and fail if the overflow or sign bits get set. If LAM
> is off, we could calculate addr | len and fail if either of the top two
> bits is set, but LAM may not let us get away with that. The general
> point being that, on x86 (as long as we ignore AMD’s LAM-like feature)
> an address is a user address if the top bit is clear. Whether that
> address is canonical or not or will translate or not is a separate
> issue. (And making this change would require allowing uaccess to #GP,
> which requires some care.)
>
> What do you think?

I think it can work. Below is my first take on this. It is raw and
requires more work.

You talk about access_ok(), but I also looked at what has to be done in the
get_user()/put_user() path. I am not sure it is a good idea (but it seems
to work).

The basic idea is to OR the start and the end of the range and check that
the MSB is clear.

I reworked the array_index_mask_nospec() thing to make the address all-ones
if it is outside of the user range. It should stop speculation just as
effectively as making it zero, which is what we do now.
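
(A C-level sketch of that logic, for readability; the real code is the
getuser.S/putuser.S assembly in the patch below, which does the masking
branchlessly with sbb/or, and the function name here is made up:)

#include <stdbool.h>
#include <stdint.h>

/*
 * The range [addr, addr + size) is a user range iff neither its start
 * nor its end has the MSB set.  On the bad path the address becomes
 * all-ones -- a guaranteed non-canonical value -- so a mispredicted
 * branch cannot speculatively load kernel data; that is the same job
 * array_index_mask_nospec() does today, with zero instead of all-ones.
 */
static inline uint64_t mask_user_addr(uint64_t addr, uint64_t size, bool *ok)
{
        uint64_t end = addr + size - 1;
        uint64_t bad = (addr | end) >> 63;      /* 1 if the MSB is set anywhere */

        *ok = !bad;
        return addr | (0 - bad);                /* all-ones when bad, unchanged when ok */
}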

The patch as it is breaks 32-bit. It has to be handled separately.

Any comments?

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 803241dfc473..53c6f86b036f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -45,6 +45,12 @@ static inline bool pagefault_disabled(void);
#define untagged_ptr(mm, ptr) (ptr)
#endif

+#define __access_ok(ptr, size) \
+({ \
+ unsigned long addr = (unsigned long)(ptr); \
+ !((addr | (size) | addr + (size)) >> (BITS_PER_LONG - 1)); \
+})
+
/**
* access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
@@ -62,10 +68,10 @@ static inline bool pagefault_disabled(void);
* Return: true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*/
-#define access_ok(addr, size) \
+#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
- likely(__access_ok(untagged_addr(current->mm, addr), size)); \
+ likely(__access_ok(addr, size)); \
})

#include <asm-generic/access_ok.h>
@@ -150,13 +156,7 @@ extern int __get_user_bad(void);
* Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#define get_user(x,ptr) \
-({ \
- __typeof__(*(ptr)) __user *__ptr_clean; \
- __ptr_clean = untagged_ptr(current->mm, ptr); \
- might_fault(); \
- do_get_user_call(get_user,x,__ptr_clean); \
-})
+#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
* __get_user - Get a simple variable from user space, with less checking.
@@ -253,12 +253,7 @@ extern void __put_user_nocheck_8(void);
*
* Return: zero on success, or -EFAULT on error.
*/
-#define put_user(x, ptr) ({ \
- __typeof__(*(ptr)) __user *__ptr_clean; \
- __ptr_clean = untagged_ptr(current->mm, ptr); \
- might_fault(); \
- do_put_user_call(put_user,x,__ptr_clean); \
-})
+#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
* __put_user - Write a simple value into user space, with less checking.
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index b70d98d79a9d..e526ae6ff844 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -37,22 +37,13 @@

#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC

-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
-#endif
-
.text
SYM_FUNC_START(__get_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ mov %_ASM_AX, %_ASM_DX
+ shl $1, %_ASM_DX
+ jb bad_get_user
+ sbb %_ASM_DX, %_ASM_DX
+ or %_ASM_DX, %_ASM_AX
ASM_STAC
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
@@ -62,11 +53,13 @@ SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)

SYM_FUNC_START(__get_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ mov %_ASM_AX, %_ASM_DX
+ add $1, %_ASM_DX
+ or %_ASM_AX, %_ASM_DX
+ shl $1, %_ASM_DX
+ jb bad_get_user
+ sbb %_ASM_DX, %_ASM_DX
+ or %_ASM_DX, %_ASM_AX
ASM_STAC
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
@@ -76,11 +69,13 @@ SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)

SYM_FUNC_START(__get_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ mov %_ASM_AX, %_ASM_DX
+ add $3, %_ASM_DX
+ or %_ASM_AX, %_ASM_DX
+ shl $1, %_ASM_DX
+ jb bad_get_user
+ sbb %_ASM_DX, %_ASM_DX
+ or %_ASM_DX, %_ASM_AX
ASM_STAC
3: movl (%_ASM_AX),%edx
xor %eax,%eax
@@ -91,22 +86,26 @@ EXPORT_SYMBOL(__get_user_4)

SYM_FUNC_START(__get_user_8)
#ifdef CONFIG_X86_64
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ mov %_ASM_AX, %_ASM_DX
+ add $7, %_ASM_DX
+ or %_ASM_AX, %_ASM_DX
+ shl $1, %_ASM_DX
+ jb bad_get_user
+ sbb %_ASM_DX, %_ASM_DX
+ or %_ASM_DX, %_ASM_AX
ASM_STAC
4: movq (%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
RET
#else
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user_8
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ mov %_ASM_AX, %_ASM_DX
+ add $7, %_ASM_DX
+ or %_ASM_AX, %_ASM_DX
+ shl $1, %_ASM_DX
+ jb bad_get_user
+ sbb %_ASM_DX, %_ASM_DX
+ or %_ASM_DX, %_ASM_AX
ASM_STAC
4: movl (%_ASM_AX),%edx
5: movl 4(%_ASM_AX),%ecx
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index b7dfd60243b7..eb7c4396cb1e 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -33,20 +33,11 @@
* as they get called from within inline assembly.
*/

-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_BX
-#endif
-
.text
SYM_FUNC_START(__put_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ mov %_ASM_CX, %_ASM_BX
+ shl $1, %_ASM_BX
+ jb .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
ENDBR
ASM_STAC
@@ -59,9 +50,11 @@ EXPORT_SYMBOL(__put_user_1)
EXPORT_SYMBOL(__put_user_nocheck_1)

SYM_FUNC_START(__put_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ mov %_ASM_CX, %_ASM_BX
+ add $1, %_ASM_BX
+ or %_ASM_CX, %_ASM_BX
+ shl $1, %_ASM_BX
+ jb .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
ENDBR
ASM_STAC
@@ -74,9 +67,11 @@ EXPORT_SYMBOL(__put_user_2)
EXPORT_SYMBOL(__put_user_nocheck_2)

SYM_FUNC_START(__put_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ mov %_ASM_CX, %_ASM_BX
+ add $3, %_ASM_BX
+ or %_ASM_CX, %_ASM_BX
+ shl $1, %_ASM_BX
+ jb .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
ENDBR
ASM_STAC
@@ -89,9 +84,11 @@ EXPORT_SYMBOL(__put_user_4)
EXPORT_SYMBOL(__put_user_nocheck_4)

SYM_FUNC_START(__put_user_8)
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ mov %_ASM_CX, %_ASM_BX
+ add $7, %_ASM_BX
+ or %_ASM_CX, %_ASM_BX
+ shl $1, %_ASM_BX
+ jb .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
ENDBR
ASM_STAC
--
Kirill A. Shutemov