LinuxLists.cc - [PATCH 1/2] x86/mm: delete unused cpu argument to leave

2024-01-26 09:10:15

Subject: [PATCH 1/2] x86/mm: delete unused cpu argument to leave_mm()

The argument is unused since commit 3d28ebceaffa ("x86/mm: Rework lazy
TLB to track the actual loaded mm"), delete it.

Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/include/asm/mmu.h | 2 +-
arch/x86/kernel/alternative.c | 2 +-
arch/x86/mm/tlb.c | 2 +-
arch/x86/xen/mmu_pv.c | 2 +-
drivers/cpuidle/cpuidle.c | 2 +-
include/linux/mmu_context.h | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490c..ce4677b8b7356 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -75,7 +75,7 @@ typedef struct {
.lock = __MUTEX_INITIALIZER(mm.context.lock), \
}

-void leave_mm(int cpu);
+void leave_mm(void);
#define leave_mm leave_mm

#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cc130b57542ac..66bd265c7a587 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm(smp_processor_id());
+ leave_mm();

temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab6..80b0caa82a91b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
write_cr3(new_mm_cr3);
}

-void leave_mm(int cpu)
+void leave_mm(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 72af496a160c8..218773cfb009f 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info)
struct mm_struct *mm = info;

if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
+ leave_mm();

/*
* If this cpu still has a stale cr3 reference, then make sure
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 737a026ef58a3..02e40fd7d948c 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
}

if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
- leave_mm(dev->cpu);
+ leave_mm();

/* Take note of the planned idle state. */
sched_idle_set_state(target_state);
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index f2b7a3f040999..bbaec80c78c50 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -11,7 +11,7 @@
#endif

#ifndef leave_mm
-static inline void leave_mm(int cpu) { }
+static inline void leave_mm(void) { }
#endif

/*
--
2.43.0.429.g432eaa2c6b-goog

2024-01-26 09:11:38

by Yosry Ahmed

[permalink] [raw]

Subject: [PATCH 2/2] x86/mm: clarify "prev" usage in switch_mm_irqs_off()

In the x86 implementation of switch_mm_irqs_off(), we do not use the
"prev" argument passed in by the caller, we use exclusively use
"real_prev", which is cpu_tlbstate.loaded_mm. This is not obvious at the
first sight.

Furthermore, a comment describes a condition that happens
when called with prev == next, but this should not affect the function
in any way since prev is unused. Apparently, the comment is intended to
clarify why we don't rely on prev == next to decide whether we need to
update CR3, but again, it is not obvious. The comment also references
the fact that leave_mm() calls with prev == NULL and tsk == NULL, but
this also shouldn't matter because prev is unused and tsk is only used
in one function which has a NULL check.

Clarify things by renaming (prev -> unused) and (real_prev -> prev),
also move and rewrite the comment as an explanation for why we don't
rely on "prev" supplied by the caller in x86 code and use our own.
Hopefully this makes reading the code easier.

Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/mm/tlb.c | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 80b0caa82a91b..bf9605caf24f7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif

-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * The "prev" argument passed by the caller does not always match CR3. For
+ * example, the scheduler passes in active_mm when switching from lazy TLB mode
+ * to normal mode, but switch_mm_irqs_off() can be called from x86 code without
+ * updating active_mm. Use cpu_tlbstate.loaded_mm instead.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
struct task_struct *tsk)
{
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
@@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
bool need_flush;
u16 new_asid;

- /*
- * NB: The scheduler will call us with prev == next when switching
- * from lazy TLB mode to normal mode if active_mm isn't changing.
- * When this happens, we don't assume that CR3 (and hence
- * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
/* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
@@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (prev == next) {
/* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
@@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(real_prev != &init_mm &&
+ if (WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));

@@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
*/
- if (real_prev != &init_mm) {
+ if (prev != &init_mm) {
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(real_prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ mm_cpumask(prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
}

/*
@@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

- if (next != real_prev) {
+ if (next != prev) {
cr4_update_pce_mm(next);
- switch_ldt(real_prev, next);
+ switch_ldt(prev, next);
}
}

--
2.43.0.429.g432eaa2c6b-goog

2024-02-22 16:48:28

by Dave Hansen

[permalink] [raw]

Subject: Re: [PATCH 2/2] x86/mm: clarify "prev" usage in switch_mm_irqs_off()

On 1/26/24 00:06, Yosry Ahmed wrote:
> +/*
> + * The "prev" argument passed by the caller does not always match CR3. For
> + * example, the scheduler passes in active_mm when switching from lazy TLB mode
> + * to normal mode, but switch_mm_irqs_off() can be called from x86 code without
> + * updating active_mm. Use cpu_tlbstate.loaded_mm instead.
> + */
> +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> struct task_struct *tsk)

One nit here: It's not obvious that "unused" is 'the "prev" argument'.

Would something like this be more clear?

/*
* This optimizes when not actually switching mm's. Some architectures
* use the 'unused' argument for this optimization, but x86 must use
* 'cpu_tlbstate.loaded_mm' instead because it does not always keep
* ->active_mm up to date.
*/

Also, I think it might be useful to have the rule that arch/x86 code
_always_ calls switch_mm_irqs_off() with the first argument (the
newly-named 'unused') set to NULL. I think there's only one site:

> void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> struct task_struct *tsk)
> {
> unsigned long flags;
>
> local_irq_save(flags);
> switch_mm_irqs_off(prev, next, tsk);
> local_irq_restore(flags);
> }

2024-02-22 18:55:01

by Dave Hansen

[permalink] [raw]

Subject: Re: [PATCH 2/2] x86/mm: clarify "prev" usage in switch_mm_irqs_off()

On 2/22/24 10:43, Yosry Ahmed wrote:
>> /*
>> * This optimizes when not actually switching mm's. Some architectures
>> * use the 'unused' argument for this optimization, but x86 must use
>> * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
>> * ->active_mm up to date.
>> */
> Yes, this is more clear, thanks! However, Andrew already merged that
> patch into mm-stable, so it cannot be amended. I can send a separate
> patch to rewrite the comment tho if you'd like, WDYT?
>
>> Also, I think it might be useful to have the rule that arch/x86 code
>> _always_ calls switch_mm_irqs_off() with the first argument (the
>> newly-named 'unused') set to NULL. I think there's only one site:
> Agreed. I can also send a separate patch for this. Thanks!

That would be great. I'd be happy to ack them.

2024-02-22 19:57:06

by Yosry Ahmed

[permalink] [raw]

Subject: Re: [PATCH 2/2] x86/mm: clarify "prev" usage in switch_mm_irqs_off()

On Thu, Feb 22, 2024 at 08:48:17AM -0800, Dave Hansen wrote:
> On 1/26/24 00:06, Yosry Ahmed wrote:
> > +/*
> > + * The "prev" argument passed by the caller does not always match CR3. For
> > + * example, the scheduler passes in active_mm when switching from lazy TLB mode
> > + * to normal mode, but switch_mm_irqs_off() can be called from x86 code without
> > + * updating active_mm. Use cpu_tlbstate.loaded_mm instead.
> > + */
> > +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
> > struct task_struct *tsk)
>
> One nit here: It's not obvious that "unused" is 'the "prev" argument'.
>
> Would something like this be more clear?
>
> /*
> * This optimizes when not actually switching mm's. Some architectures
> * use the 'unused' argument for this optimization, but x86 must use
> * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
> * ->active_mm up to date.
> */

Yes, this is more clear, thanks! However, Andrew already merged that
patch into mm-stable, so it cannot be amended. I can send a separate
patch to rewrite the comment tho if you'd like, WDYT?

>
> Also, I think it might be useful to have the rule that arch/x86 code
> _always_ calls switch_mm_irqs_off() with the first argument (the
> newly-named 'unused') set to NULL. I think there's only one site:

Agreed. I can also send a separate patch for this. Thanks!

>
> > void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> > struct task_struct *tsk)
> > {
> > unsigned long flags;
> >
> > local_irq_save(flags);
> > switch_mm_irqs_off(prev, next, tsk);
> > local_irq_restore(flags);
> > }
>