LAM can only be enabled when a process is single-threaded. But _kernel_
threads can temporarily use a single-threaded process's mm.
If LAM is enabled by a userspace process while a kthread is using its
mm, the kthread will not observe LAM enablement (i.e. LAM will be
disabled in CR3). This could be fine for the kthread itself, as LAM only
affects userspace addresses. However, if the kthread context switches to
a thread in the same userspace process, CR3 may or may not be updated
because the mm_struct doesn't change (based on pending TLB flushes). If
CR3 is not updated, the userspace thread will run incorrectly with LAM
disabled, which may cause page faults when using tagged addresses.
Example scenario:
CPU 1 CPU 2
/* kthread */
kthread_use_mm()
/* user thread */
prctl_enable_tagged_addr()
/* LAM enabled on CPU 2 */
/* LAM disabled on CPU 1 */
context_switch() /* to CPU 1 */
/* Switching to user thread */
switch_mm_irqs_off()
/* CR3 not updated */
/* LAM is still disabled on CPU 1 */
Synchronize LAM enablement by sending an IPI from
prctl_enable_tagged_addr() to all CPUs running with the mm_struct to
enable LAM. This makes sure LAM is enabled on CPU 1 in the above
scenario before prctl_enable_tagged_addr() returns and userspace starts
using tagged addresses, and before it's possible to run the userspace
process on CPU 1.
In switch_mm_irqs_off(), move reading the LAM mask until after
mm_cpumask() is updated. This ensures that if an outdated LAM mask is
written to CR3, an IPI is received to update it right after IRQs are
re-enabled.
Fixes: 82721d8b25d7 ("x86/mm: Handle LAM on context switch")
Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/kernel/process_64.c | 13 +++++++++++--
arch/x86/mm/tlb.c | 7 +++----
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 33b268747bb7b..76e91fc68c5f3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -750,6 +750,16 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
#define LAM_U57_BITS 6
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
+ set_tlbstate_lam_mode(mm);
+ }
+}
+
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@@ -782,8 +792,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
return -EINVAL;
}
- write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
- set_tlbstate_lam_mode(mm);
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
mmap_write_unlock(mm);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab6..e8feb2e154db2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -497,9 +497,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
+ unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
@@ -622,9 +622,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
}
- /*
- * Start remote flushes and then read tlb_gen.
- */
+ /* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm)
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
@@ -636,6 +634,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}
+ new_lam = mm_lam_cr3_mask(next);
set_tlbstate_lam_mode(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
--
2.44.0.278.ge034bb2e1d-goog
LAM can only be enabled when a process is single-threaded. But _kernel_
threads can temporarily use a single-threaded process's mm. That means
that a context-switching kernel thread can race and observe the mm's LAM
metadata (mm->context.lam_cr3_mask) change.
The context switch code does two logical things with that metadata:
populate CR3 and populate 'cpu_tlbstate.lam'. If it hits this race,
'cpu_tlbstate.lam' and CR3 can end up out of sync.
This de-synchronization is currently harmless. But it is confusing and
might lead to warnings or real bugs.
Update set_tlbstate_lam_mode() to take in the LAM mask and untag mask
instead of an mm_struct pointer, and while we are at it, rename it to
cpu_tlbstate_update_lam(). This should also make it clearer that we are
updating cpu_tlbstate. In switch_mm_irqs_off(), read the LAM mask once
and use it for both the cpu_tlbstate update and the CR3 update.
Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/include/asm/mmu_context.h | 8 +++++++-
arch/x86/include/asm/tlbflush.h | 9 ++++-----
arch/x86/kernel/process_64.c | 6 ++++--
arch/x86/mm/tlb.c | 8 +++++---
4 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8dac45a2c7fcf..19091ebb86338 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
#ifdef CONFIG_ADDRESS_MASKING
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
- return mm->context.lam_cr3_mask;
+ /*
+ * When switch_mm_irqs_off() is called for a kthread, it may race with
+ * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
+ * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
+ * reads a single value for both.
+ */
+ return READ_ONCE(mm->context.lam_cr3_mask);
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4d..69e79fff41b80 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return lam << X86_CR3_LAM_U57_BIT;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
- this_cpu_write(cpu_tlbstate.lam,
- mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
- this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+ this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, untag_mask);
}
#else
@@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return 0;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 76e91fc68c5f3..748d2b3bdb985 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -753,10 +753,12 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
static void enable_lam_func(void *__mm)
{
struct mm_struct *mm = __mm;
+ unsigned long lam = mm_lam_cr3_mask(mm);
+ u64 untag_mask = mm_untag_mask(mm);
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
- write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
- set_tlbstate_lam_mode(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, untag_mask);
}
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e8feb2e154db2..b2f74c451b3d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -635,7 +636,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
}
new_lam = mm_lam_cr3_mask(next);
- set_tlbstate_lam_mode(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -654,6 +654,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
if (next != real_prev) {
cr4_update_pce_mm(next);
@@ -700,6 +701,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@@ -707,7 +709,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
- WARN_ON(mm_lam_cr3_mask(mm));
+ WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -726,7 +728,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- set_tlbstate_lam_mode(mm);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
--
2.44.0.278.ge034bb2e1d-goog
There are two separate checks in prctl_enable_tagged_addr() that nr_bits
is in the correct range. The checks are arranged such the correct case
is sandwiched between both error cases, which do exactly the same thing.
Simplify the if condition and pull the correct case outside with the
rest of the success code path.
Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/kernel/process_64.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 748d2b3bdb985..0608c4df4e95d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -783,17 +783,13 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
return -EBUSY;
}
- if (!nr_bits) {
- mmap_write_unlock(mm);
- return -EINVAL;
- } else if (nr_bits <= LAM_U57_BITS) {
- mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
- mm->context.untag_mask = ~GENMASK(62, 57);
- } else {
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
mmap_write_unlock(mm);
return -EINVAL;
}
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
--
2.44.0.278.ge034bb2e1d-goog
On Tue, Mar 12, 2024 at 03:59:49AM +0000, Yosry Ahmed wrote:
> LAM can only be enabled when a process is single-threaded. But _kernel_
> threads can temporarily use a single-threaded process's mm.
>
> If LAM is enabled by a userspace process while a kthread is using its
> mm, the kthread will not observe LAM enablement (i.e. LAM will be
> disabled in CR3). This could be fine for the kthread itself, as LAM only
> affects userspace addresses. However, if the kthread context switches to
> a thread in the same userspace process, CR3 may or may not be updated
> because the mm_struct doesn't change (based on pending TLB flushes). If
> CR3 is not updated, the userspace thread will run incorrectly with LAM
> disabled, which may cause page faults when using tagged addresses.
> Example scenario:
>
> CPU 1 CPU 2
> /* kthread */
> kthread_use_mm()
> /* user thread */
> prctl_enable_tagged_addr()
> /* LAM enabled on CPU 2 */
> /* LAM disabled on CPU 1 */
> context_switch() /* to CPU 1 */
> /* Switching to user thread */
> switch_mm_irqs_off()
> /* CR3 not updated */
> /* LAM is still disabled on CPU 1 */
>
> Synchronize LAM enablement by sending an IPI from
> prctl_enable_tagged_addr() to all CPUs running with the mm_struct to
> enable LAM. This makes sure LAM is enabled on CPU 1 in the above
> scenario before prctl_enable_tagged_addr() returns and userspace starts
> using tagged addresses, and before it's possible to run the userspace
> process on CPU 1.
>
> In switch_mm_irqs_off(), move reading the LAM mask until after
> mm_cpumask() is updated. This ensures that if an outdated LAM mask is
> written to CR3, an IPI is received to update it right after IRQs are
> re-enabled.
>
> Fixes: 82721d8b25d7 ("x86/mm: Handle LAM on context switch")
> Suggested-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Yosry Ahmed <[email protected]>
Reviewed-by: Kirill A. Shutemov <[email protected]>
--
Kiryl Shutsemau / Kirill A. Shutemov
On Tue, Mar 12, 2024 at 03:59:51AM +0000, Yosry Ahmed wrote:
> There are two separate checks in prctl_enable_tagged_addr() that nr_bits
> is in the correct range. The checks are arranged such the correct case
> is sandwiched between both error cases, which do exactly the same thing.
>
> Simplify the if condition and pull the correct case outside with the
> rest of the success code path.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
Reviewed-by: Kirill A. Shutemov <[email protected]>
--
Kiryl Shutsemau / Kirill A. Shutemov
On Tue, Mar 12, 2024 at 02:35:46PM +0200, Kirill A. Shutemov wrote:
> On Tue, Mar 12, 2024 at 03:59:50AM +0000, Yosry Ahmed wrote:
> > diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> > index 76e91fc68c5f3..748d2b3bdb985 100644
> > --- a/arch/x86/kernel/process_64.c
> > +++ b/arch/x86/kernel/process_64.c
> > @@ -753,10 +753,12 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
> > static void enable_lam_func(void *__mm)
> > {
> > struct mm_struct *mm = __mm;
> > + unsigned long lam = mm_lam_cr3_mask(mm);
> > + u64 untag_mask = mm_untag_mask(mm);
> >
>
> Maybe push these mm dereferences inside the if block below?
> I am not sure if compiler can re-order operations past this_cpu_read().
I am not sure either, so I will push them inside. Better safe than
sorry. Will send a v2 shortly, thanks for the reviews!
>
> > if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
> > - write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
> > - set_tlbstate_lam_mode(mm);
> > + write_cr3(__read_cr3() | lam);
> > + cpu_tlbstate_update_lam(lam, untag_mask);
> > }
> > }
> >
>
> --
> Kiryl Shutsemau / Kirill A. Shutemov
On Tue, Mar 12, 2024 at 03:59:50AM +0000, Yosry Ahmed wrote:
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 76e91fc68c5f3..748d2b3bdb985 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -753,10 +753,12 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
> static void enable_lam_func(void *__mm)
> {
> struct mm_struct *mm = __mm;
> + unsigned long lam = mm_lam_cr3_mask(mm);
> + u64 untag_mask = mm_untag_mask(mm);
>
Maybe push these mm dereferences inside the if block below?
I am not sure if compiler can re-order operations past this_cpu_read().
> if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
> - write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
> - set_tlbstate_lam_mode(mm);
> + write_cr3(__read_cr3() | lam);
> + cpu_tlbstate_update_lam(lam, untag_mask);
> }
> }
>
--
Kiryl Shutsemau / Kirill A. Shutemov