2022-07-12 23:41:44

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv5 05/13] x86/uaccess: Provide untagged_addr() and remove tags before address check

untagged_addr() is a helper used by the core-mm to strip tag bits and
get the address to the canonical shape. In only handles userspace
addresses. The untagging mask is stored in mmu_context and will be set
on enabling LAM for the process.

The tags must not be included into check whether it's okay to access the
userspace address.

Strip tags in access_ok().

get_user() and put_user() don't use access_ok(), but check access
against TASK_SIZE directly in assembly. Strip tags, before calling into
the assembly helper.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/mmu.h | 3 +++
arch/x86/include/asm/mmu_context.h | 11 ++++++++
arch/x86/include/asm/uaccess.h | 42 +++++++++++++++++++++++++++---
arch/x86/kernel/process.c | 3 +++
4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 002889ca8978..2fdb390040b5 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -43,6 +43,9 @@ typedef struct {

/* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
unsigned long lam_cr3_mask;
+
+ /* Significant bits of the virtual address. Excludes tag bits. */
+ u64 untag_mask;
#endif

struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 69c943b2ae90..5bd3d46685dc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -100,6 +100,12 @@ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+ mm->context.untag_mask = oldmm->context.untag_mask;
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+ mm->context.untag_mask = -1UL;
}

#else
@@ -112,6 +118,10 @@ static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+}
#endif

#define enter_lazy_tlb enter_lazy_tlb
@@ -138,6 +148,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
+ mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 913e593a3b45..803241dfc473 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,6 +6,7 @@
*/
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
+#include <linux/mm_types.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
@@ -20,6 +21,30 @@ static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ()
#endif

+#ifdef CONFIG_X86_64
+/*
+ * Mask out tag bits from the address.
+ *
+ * Magic with the 'sign' allows to untag userspace pointer without any branches
+ * while leaving kernel addresses intact.
+ */
+#define untagged_addr(mm, addr) ({ \
+ u64 __addr = (__force u64)(addr); \
+ s64 sign = (s64)__addr >> 63; \
+ __addr &= (mm)->context.untag_mask | sign; \
+ (__force __typeof__(addr))__addr; \
+})
+
+#define untagged_ptr(mm, ptr) ({ \
+ u64 __ptrval = (__force u64)(ptr); \
+ __ptrval = untagged_addr(mm, __ptrval); \
+ (__force __typeof__(*(ptr)) *)__ptrval; \
+})
+#else
+#define untagged_addr(mm, addr) (addr)
+#define untagged_ptr(mm, ptr) (ptr)
+#endif
+
/**
* access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
@@ -40,7 +65,7 @@ static inline bool pagefault_disabled(void);
#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
- likely(__access_ok(addr, size)); \
+ likely(__access_ok(untagged_addr(current->mm, addr), size)); \
})

#include <asm-generic/access_ok.h>
@@ -125,7 +150,13 @@ extern int __get_user_bad(void);
* Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })
+#define get_user(x,ptr) \
+({ \
+ __typeof__(*(ptr)) __user *__ptr_clean; \
+ __ptr_clean = untagged_ptr(current->mm, ptr); \
+ might_fault(); \
+ do_get_user_call(get_user,x,__ptr_clean); \
+})

/**
* __get_user - Get a simple variable from user space, with less checking.
@@ -222,7 +253,12 @@ extern void __put_user_nocheck_8(void);
*
* Return: zero on success, or -EFAULT on error.
*/
-#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })
+#define put_user(x, ptr) ({ \
+ __typeof__(*(ptr)) __user *__ptr_clean; \
+ __ptr_clean = untagged_ptr(current->mm, ptr); \
+ might_fault(); \
+ do_put_user_call(put_user,x,__ptr_clean); \
+})

/**
* __put_user - Write a simple value into user space, with less checking.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9b2772b7e1f3..18b2bfdf7b9b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,6 +47,7 @@
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>
+#include <asm/mmu_context.h>

#include "process.h"

@@ -367,6 +368,8 @@ void arch_setup_new_exec(void)
task_clear_spec_ssb_noexec(current);
speculation_ctrl_update(read_thread_flags());
}
+
+ mm_reset_untag_mask(current->mm);
}

#ifdef CONFIG_X86_IOPL_IOPERM
--
2.35.1


2022-07-13 15:06:11

by Kirill A. Shutemov

[permalink] [raw]
Subject: [PATCHv5.1 04/13] x86/mm: Handle LAM on context switch

Linear Address Masking mode for userspace pointers encoded in CR3 bits.
The mode is selected per-thread. Add new thread features indicate that the
thread has Linear Address Masking enabled.

switch_mm_irqs_off() now respects these flags and constructs CR3
accordingly.

The active LAM mode gets recorded in the tlb_state.

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
v5.1:
- Fix build issue with CONFIG_MODULE=y
---
arch/x86/include/asm/mmu.h | 3 +++
arch/x86/include/asm/mmu_context.h | 24 +++++++++++++++++
arch/x86/include/asm/tlbflush.h | 35 +++++++++++++++++++++++++
arch/x86/mm/tlb.c | 42 +++++++++++++++++++-----------
4 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..002889ca8978 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -40,6 +40,9 @@ typedef struct {

#ifdef CONFIG_X86_64
unsigned short flags;
+
+ /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
+ unsigned long lam_cr3_mask;
#endif

struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00..69c943b2ae90 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
}
#endif

+#ifdef CONFIG_X86_64
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return mm->context.lam_cr3_mask;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+}
+
+#else
+
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+}
+#endif
+
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

@@ -168,6 +191,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
+ dup_lam(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4af5579c7ef7..efe83d33327f 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -100,6 +100,16 @@ struct tlb_state {
*/
bool invalidate_other;

+#ifdef CONFIG_X86_64
+ /*
+ * Active LAM mode.
+ *
+ * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
+ * disabled.
+ */
+ u8 lam;
+#endif
+
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
@@ -356,6 +366,30 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#define huge_pmd_needs_flush huge_pmd_needs_flush

+#ifdef CONFIG_X86_64
+static inline unsigned long tlbstate_lam_cr3_mask(void)
+{
+ unsigned long lam = this_cpu_read(cpu_tlbstate.lam);
+
+ return lam << X86_CR3_LAM_U57_BIT;
+}
+
+static inline void set_tlbstate_cr3_lam_mask(unsigned long mask)
+{
+ this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
+}
+
+#else
+
+static inline unsigned long tlbstate_lam_cr3_mask(void)
+{
+ return 0;
+}
+
+static inline void set_tlbstate_cr3_lam_mask(u64 mask)
+{
+}
+#endif
#endif /* !MODULE */

static inline void __native_tlb_flush_global(unsigned long cr4)
@@ -363,4 +397,5 @@ static inline void __native_tlb_flush_global(unsigned long cr4)
native_write_cr4(cr4 ^ X86_CR4_PGE);
native_write_cr4(cr4);
}
+
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d400b6d9d246..4c93f87a8928 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -154,17 +154,18 @@ static inline u16 user_pcid(u16 asid)
return ret;
}

-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
- return __sme_pa(pgd) | kern_pcid(asid);
+ return __sme_pa(pgd) | kern_pcid(asid) | lam;
} else {
VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(pgd);
+ return __sme_pa(pgd) | lam;
}
}

-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+ unsigned long lam)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
@@ -173,7 +174,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
- return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ return __sme_pa(pgd) | kern_pcid(asid) | lam | CR3_NOFLUSH;
}

/*
@@ -274,15 +275,16 @@ static inline void invalidate_user_asid(u16 asid)
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}

-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+ bool need_flush)
{
unsigned long new_mm_cr3;

if (need_flush) {
invalidate_user_asid(new_asid);
- new_mm_cr3 = build_cr3(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
} else {
- new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
}

/*
@@ -491,6 +493,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long prev_lam = tlbstate_lam_cr3_mask();
+ unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
@@ -520,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -622,15 +626,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}

+ set_tlbstate_cr3_lam_mask(new_lam);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, true);

trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, false);

trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -691,6 +696,10 @@ void initialize_tlbstate_and_flush(void)
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));

+ /* LAM expected to be disabled in CR3 and init_mm */
+ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+ WARN_ON(mm_lam_cr3_mask(&init_mm));
+
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
* doesn't work like other CR4 bits because it can only be set from
@@ -699,8 +708,8 @@ void initialize_tlbstate_and_flush(void)
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));

- /* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm->pgd, 0));
+ /* Disable LAM, force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm->pgd, 0, 0));

/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -708,6 +717,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+ set_tlbstate_cr3_lam_mask(0);

for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -1047,8 +1057,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
*/
unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
- this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ unsigned long cr3 =
+ build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+ tlbstate_lam_cr3_mask());

/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
--
2.35.1

2022-07-20 09:16:52

by Alexander Potapenko

[permalink] [raw]
Subject: Re: [PATCHv5.1 04/13] x86/mm: Handle LAM on context switch

> /*
> @@ -491,6 +493,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> {
> struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> + unsigned long prev_lam = tlbstate_lam_cr3_mask();
Note: this variable is never used if CONFIG_DEBUG_VM is off.

> #ifdef CONFIG_DEBUG_VM
> - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
> + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
> /*
> * If we were to BUG here, we'd be very likely to kill
> * the system so hard that we don't see the call trace.

2022-07-20 13:17:42

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCHv5.1 04/13] x86/mm: Handle LAM on context switch

On Wed, Jul 20, 2022 at 10:57:01AM +0200, Alexander Potapenko wrote:
> > /*
> > @@ -491,6 +493,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> > {
> > struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> > u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> > + unsigned long prev_lam = tlbstate_lam_cr3_mask();
> Note: this variable is never used if CONFIG_DEBUG_VM is off.

Good point. I will add this:

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4c93f87a8928..5e9ed9f55c36 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -558,6 +558,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
+ VM_WARN_ON(prev_lam != new_lam);

/*
* Even in lazy TLB mode, the CPU should stay set in the
--
Kirill A. Shutemov

2022-07-21 14:11:44

by Alexander Potapenko

[permalink] [raw]
Subject: Re: [PATCHv5.1 04/13] x86/mm: Handle LAM on context switch

On Wed, Jul 13, 2022 at 5:02 PM Kirill A. Shutemov
<[email protected]> wrote:
>
> Linear Address Masking mode for userspace pointers encoded in CR3 bits.
> The mode is selected per-thread. Add new thread features indicate that the
> thread has Linear Address Masking enabled.
>
> switch_mm_irqs_off() now respects these flags and constructs CR3
> accordingly.
>
> The active LAM mode gets recorded in the tlb_state.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
Tested-by: Alexander Potapenko <[email protected]>

> ---
> v5.1:
> - Fix build issue with CONFIG_MODULE=y
> ---
> arch/x86/include/asm/mmu.h | 3 +++
> arch/x86/include/asm/mmu_context.h | 24 +++++++++++++++++
> arch/x86/include/asm/tlbflush.h | 35 +++++++++++++++++++++++++
> arch/x86/mm/tlb.c | 42 +++++++++++++++++++-----------
> 4 files changed, 89 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 5d7494631ea9..002889ca8978 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -40,6 +40,9 @@ typedef struct {
>
> #ifdef CONFIG_X86_64
> unsigned short flags;
> +
> + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
> + unsigned long lam_cr3_mask;
> #endif
>
> struct mutex lock;
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index b8d40ddeab00..69c943b2ae90 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -91,6 +91,29 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
> }
> #endif
>
> +#ifdef CONFIG_X86_64
> +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
> +{
> + return mm->context.lam_cr3_mask;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
> +}
> +
> +#else
> +
> +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +}
> +#endif
> +
> #define enter_lazy_tlb enter_lazy_tlb
> extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
>
> @@ -168,6 +191,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> {
> arch_dup_pkeys(oldmm, mm);
> paravirt_arch_dup_mmap(oldmm, mm);
> + dup_lam(oldmm, mm);
> return ldt_dup_context(oldmm, mm);
> }
>
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 4af5579c7ef7..efe83d33327f 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -100,6 +100,16 @@ struct tlb_state {
> */
> bool invalidate_other;
>
> +#ifdef CONFIG_X86_64
> + /*
> + * Active LAM mode.
> + *
> + * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
> + * disabled.
> + */
> + u8 lam;
> +#endif
> +
> /*
> * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
> * the corresponding user PCID needs a flush next time we
> @@ -356,6 +366,30 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
> }
> #define huge_pmd_needs_flush huge_pmd_needs_flush
>
> +#ifdef CONFIG_X86_64
> +static inline unsigned long tlbstate_lam_cr3_mask(void)
> +{
> + unsigned long lam = this_cpu_read(cpu_tlbstate.lam);
> +
> + return lam << X86_CR3_LAM_U57_BIT;
> +}
> +
> +static inline void set_tlbstate_cr3_lam_mask(unsigned long mask)
> +{
> + this_cpu_write(cpu_tlbstate.lam, mask >> X86_CR3_LAM_U57_BIT);
> +}
> +
> +#else
> +
> +static inline unsigned long tlbstate_lam_cr3_mask(void)
> +{
> + return 0;
> +}
> +
> +static inline void set_tlbstate_cr3_lam_mask(u64 mask)
> +{
> +}
> +#endif
> #endif /* !MODULE */
>
> static inline void __native_tlb_flush_global(unsigned long cr4)
> @@ -363,4 +397,5 @@ static inline void __native_tlb_flush_global(unsigned long cr4)
> native_write_cr4(cr4 ^ X86_CR4_PGE);
> native_write_cr4(cr4);
> }
> +
> #endif /* _ASM_X86_TLBFLUSH_H */
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index d400b6d9d246..4c93f87a8928 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -154,17 +154,18 @@ static inline u16 user_pcid(u16 asid)
> return ret;
> }
>
> -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
> {
> if (static_cpu_has(X86_FEATURE_PCID)) {
> - return __sme_pa(pgd) | kern_pcid(asid);
> + return __sme_pa(pgd) | kern_pcid(asid) | lam;
> } else {
> VM_WARN_ON_ONCE(asid != 0);
> - return __sme_pa(pgd);
> + return __sme_pa(pgd) | lam;
> }
> }
>
> -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
> +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
> + unsigned long lam)
> {
> VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
> /*
> @@ -173,7 +174,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
> * boot because all CPU's the have same capabilities:
> */
> VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
> - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
> + return __sme_pa(pgd) | kern_pcid(asid) | lam | CR3_NOFLUSH;
> }
>
> /*
> @@ -274,15 +275,16 @@ static inline void invalidate_user_asid(u16 asid)
> (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
> }
>
> -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
> +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
> + bool need_flush)
> {
> unsigned long new_mm_cr3;
>
> if (need_flush) {
> invalidate_user_asid(new_asid);
> - new_mm_cr3 = build_cr3(pgdir, new_asid);
> + new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
> } else {
> - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
> + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
> }
>
> /*
> @@ -491,6 +493,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> {
> struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
> + unsigned long prev_lam = tlbstate_lam_cr3_mask();
> + unsigned long new_lam = mm_lam_cr3_mask(next);
> bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
> unsigned cpu = smp_processor_id();
> u64 next_tlb_gen;
> @@ -520,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> * isn't free.
> */
> #ifdef CONFIG_DEBUG_VM
> - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
> + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, prev_lam))) {
> /*
> * If we were to BUG here, we'd be very likely to kill
> * the system so hard that we don't see the call trace.
> @@ -622,15 +626,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> barrier();
> }
>
> + set_tlbstate_cr3_lam_mask(new_lam);
> if (need_flush) {
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
> this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
> - load_new_mm_cr3(next->pgd, new_asid, true);
> + load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
>
> trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> } else {
> /* The new ASID is already up to date. */
> - load_new_mm_cr3(next->pgd, new_asid, false);
> + load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
>
> trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
> }
> @@ -691,6 +696,10 @@ void initialize_tlbstate_and_flush(void)
> /* Assert that CR3 already references the right mm. */
> WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
>
> + /* LAM expected to be disabled in CR3 and init_mm */
> + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
> + WARN_ON(mm_lam_cr3_mask(&init_mm));
> +
> /*
> * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
> * doesn't work like other CR4 bits because it can only be set from
> @@ -699,8 +708,8 @@ void initialize_tlbstate_and_flush(void)
> WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
> !(cr4_read_shadow() & X86_CR4_PCIDE));
>
> - /* Force ASID 0 and force a TLB flush. */
> - write_cr3(build_cr3(mm->pgd, 0));
> + /* Disable LAM, force ASID 0 and force a TLB flush. */
> + write_cr3(build_cr3(mm->pgd, 0, 0));
>
> /* Reinitialize tlbstate. */
> this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
> @@ -708,6 +717,7 @@ void initialize_tlbstate_and_flush(void)
> this_cpu_write(cpu_tlbstate.next_asid, 1);
> this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
> this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
> + set_tlbstate_cr3_lam_mask(0);
>
> for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
> this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
> @@ -1047,8 +1057,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> */
> unsigned long __get_current_cr3_fast(void)
> {
> - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> - this_cpu_read(cpu_tlbstate.loaded_mm_asid));
> + unsigned long cr3 =
> + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
> + this_cpu_read(cpu_tlbstate.loaded_mm_asid),
> + tlbstate_lam_cr3_mask());
>
> /* For now, be very restrictive about when this can be called. */
> VM_WARN_ON(in_nmi() || preemptible());
> --
> 2.35.1
>


--
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Liana Sebastian
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

2022-07-21 14:14:52

by Alexander Potapenko

[permalink] [raw]
Subject: Re: [PATCHv5 05/13] x86/uaccess: Provide untagged_addr() and remove tags before address check

On Wed, Jul 13, 2022 at 1:13 AM Kirill A. Shutemov
<[email protected]> wrote:
>
> untagged_addr() is a helper used by the core-mm to strip tag bits and
> get the address to the canonical shape. In only handles userspace
> addresses. The untagging mask is stored in mmu_context and will be set
> on enabling LAM for the process.
>
> The tags must not be included into check whether it's okay to access the
> userspace address.
>
> Strip tags in access_ok().
>
> get_user() and put_user() don't use access_ok(), but check access
> against TASK_SIZE directly in assembly. Strip tags, before calling into
> the assembly helper.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
Tested-by: Alexander Potapenko <[email protected]>