Currently the tlb_remove_table_smp_sync IPI is sent to all CPUs
indiscriminately, this causes unnecessary work and delays notable in
real-time use-cases and isolated cpus.
This patch will limit this IPI on systems with ARCH_HAS_CPUMASK_BITS,
Where the IPI will only be sent to cpus referencing the affected mm.
Signed-off-by: Yair Podemsky <[email protected]>
Suggested-by: David Hildenbrand <[email protected]>
---
include/asm-generic/tlb.h | 4 ++--
mm/khugepaged.c | 4 ++--
mm/mmu_gather.c | 17 ++++++++++++-----
3 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..0b6ba17cc8d3 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,7 +222,7 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
#define tlb_needs_table_invalidate() (true)
#endif
-void tlb_remove_table_sync_one(void);
+void tlb_remove_table_sync_one(struct mm_struct *mm);
#else
@@ -230,7 +230,7 @@ void tlb_remove_table_sync_one(void);
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif
-static inline void tlb_remove_table_sync_one(void) { }
+static inline void tlb_remove_table_sync_one(struct mm_struct *mm) { }
#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9d39d65b73..3e5cb079d268 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1166,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
- tlb_remove_table_sync_one();
+ tlb_remove_table_sync_one(mm);
spin_lock(pte_ptl);
result = __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1525,7 +1525,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
- tlb_remove_table_sync_one();
+ tlb_remove_table_sync_one(mm);
mmu_notifier_invalidate_range_end(&range);
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index ea9683e12936..692d8175a88e 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -191,7 +191,13 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
-void tlb_remove_table_sync_one(void)
+#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
+#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
+#else
+#define REMOVE_TABLE_IPI_MASK cpu_online_mask
+#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
+
+void tlb_remove_table_sync_one(struct mm_struct *mm)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling.
*/
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
+ NULL, true);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
@@ -237,9 +244,9 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb)
}
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(struct mm_struct *mm, void *table)
{
- tlb_remove_table_sync_one();
+ tlb_remove_table_sync_one(mm);
__tlb_remove_table(table);
}
@@ -262,7 +269,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
- tlb_remove_table_one(table);
+ tlb_remove_table_one(tlb->mm, table);
return;
}
(*batch)->nr = 0;
---
v2: replaced no REMOVE_TABLE_IPI_MASK REMOVE_TABLE_IPI_MASK to cpu_online_mask
--
2.39.3
On 6/20/23 07:46, Yair Podemsky wrote:
> -void tlb_remove_table_sync_one(void)
> +#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
> +#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
> +#else
> +#define REMOVE_TABLE_IPI_MASK cpu_online_mask
> +#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
> +
> +void tlb_remove_table_sync_one(struct mm_struct *mm)
> {
> /*
> * This isn't an RCU grace period and hence the page-tables cannot be
> @@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
> * It is however sufficient for software page-table walkers that rely on
> * IRQ disabling.
> */
> - smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
> + on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> + NULL, true);
> }
That "REMOVE_TABLE_IPI_MASK" thing is pretty confusing. It *looks* like
a constant. It does *NOT* look at all like it consumes 'mm'. Worst
case, just create a local variable:
if (IS_ENABLED(CONFIG_ARCH_HAS_CPUMASK_BITS))
ipi_mask = mm_cpumask(mm);
else
ipi_mask = cpu_online_mask;
on_each_cpu_mask(ipi_mask, ...);
That's a billion times more clear and it'll compile down to the same thing.
I do think the CONFIG_ARCH_HAS_CPUMASK_BITS naming is also pretty
confusing, but I don't have any better suggestions. Maybe something
with "MM_CPUMASK" in it?
>
> On Jun 20, 2023, at 7:46 AM, Yair Podemsky <[email protected]> wrote:
>
> @@ -1525,7 +1525,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
> addr + HPAGE_PMD_SIZE);
> mmu_notifier_invalidate_range_start(&range);
> pmd = pmdp_collapse_flush(vma, addr, pmdp);
> - tlb_remove_table_sync_one();
> + tlb_remove_table_sync_one(mm);
Can’t pmdp_collapse_flush() have one additional argument “freed_tables”
that it would propagate, for instance on x86 to flush_tlb_mm_range() ?
Then you would not need tlb_remove_table_sync_one() to issue an additional
IPI, no?
It just seems that you might still have 2 IPIs in many cases instead of
one, and unless I am missing something, I don’t see why.
On Wed, 2023-06-21 at 10:42 -0700, Dave Hansen wrote:
> On 6/20/23 07:46, Yair Podemsky wrote:
> > -void tlb_remove_table_sync_one(void)
> > +#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
> > +#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
> > +#else
> > +#define REMOVE_TABLE_IPI_MASK cpu_online_mask
> > +#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
> > +
> > +void tlb_remove_table_sync_one(struct mm_struct *mm)
> > {
> > /*
> > * This isn't an RCU grace period and hence the page-tables
> > cannot be
> > @@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
> > * It is however sufficient for software page-table walkers
> > that rely on
> > * IRQ disabling.
> > */
> > - smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
> > + on_each_cpu_mask(REMOVE_TABLE_IPI_MASK,
> > tlb_remove_table_smp_sync,
> > + NULL, true);
> > }
>
> That "REMOVE_TABLE_IPI_MASK" thing is pretty confusing. It *looks*
> like
> a constant. It does *NOT* look at all like it consumes 'mm'. Worst
> case, just create a local variable:
>
> if (IS_ENABLED(CONFIG_ARCH_HAS_CPUMASK_BITS))
> ipi_mask = mm_cpumask(mm);
> else
> ipi_mask = cpu_online_mask;
>
> on_each_cpu_mask(ipi_mask, ...);
>
> That's a billion times more clear and it'll compile down to the same
> thing.
>
> I do think the CONFIG_ARCH_HAS_CPUMASK_BITS naming is also pretty
> confusing, but I don't have any better suggestions. Maybe something
> with "MM_CPUMASK" in it?
>
Hi Dave,
Thanks for your suggestions!
I will send a new version with the local variable as you suggested
soon.
As for the config name, what about CONFIG_ARCH_HAS_MM_CPUMASK?
Thanks,
Yair
On 6/22/23 06:14, [email protected] wrote:
> I will send a new version with the local variable as you suggested
> soon.
> As for the config name, what about CONFIG_ARCH_HAS_MM_CPUMASK?
The confusing part about that name is that mm_cpumask() and
mm->cpu_bitmap[] are defined unconditionally. So, they're *around*
unconditionally but just aren't updated.
BTW, it would also be nice to have _some_ kind of data behind this patch.
Fewer IPIs are better I guess, but it would still be nice if you could say:
Before this patch, /proc/interrupts showed 123 IPIs/hour for an
isolated CPU. After the approach here, it was 0.
... or something.
On Wed, 2023-06-21 at 11:02 -0700, Nadav Amit wrote:
> > On Jun 20, 2023, at 7:46 AM, Yair Podemsky <[email protected]>
> > wrote:
> >
> > @@ -1525,7 +1525,7 @@ static void collapse_and_free_pmd(struct
> > mm_struct *mm, struct vm_area_struct *v
> > addr + HPAGE_PMD_SIZE);
> > mmu_notifier_invalidate_range_start(&range);
> > pmd = pmdp_collapse_flush(vma, addr, pmdp);
> > - tlb_remove_table_sync_one();
> > + tlb_remove_table_sync_one(mm);
>
> Can’t pmdp_collapse_flush() have one additional argument
> “freed_tables”
> that it would propagate, for instance on x86 to flush_tlb_mm_range()
> ?
> Then you would not need tlb_remove_table_sync_one() to issue an
> additional
> IPI, no?
>
> It just seems that you might still have 2 IPIs in many cases instead
> of
> one, and unless I am missing something, I don’t see why.
>
Hi Nadav,
Thanks for your comment.
I think you are right and in some configurations 2 IPIs might occur.
However I a am not really dealing with the thp code at the moment,
This patch is about the mmu_gatherer and mostly dealing with IPIs sent
via the other code path.
Thanks,
Yair
On Wed, Jun 21, 2023 at 11:02 AM Nadav Amit <[email protected]> wrote:
>
> >
> > On Jun 20, 2023, at 7:46 AM, Yair Podemsky <[email protected]> wrote:
> >
> > @@ -1525,7 +1525,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
> > addr + HPAGE_PMD_SIZE);
> > mmu_notifier_invalidate_range_start(&range);
> > pmd = pmdp_collapse_flush(vma, addr, pmdp);
> > - tlb_remove_table_sync_one();
> > + tlb_remove_table_sync_one(mm);
>
> Can’t pmdp_collapse_flush() have one additional argument “freed_tables”
> that it would propagate, for instance on x86 to flush_tlb_mm_range() ?
> Then you would not need tlb_remove_table_sync_one() to issue an additional
> IPI, no?
>
> It just seems that you might still have 2 IPIs in many cases instead of
> one, and unless I am missing something, I don’t see why.
The tlb_remove_table_sync_one() is used to serialize against fast GUP
for the architectures which don't broadcast TLB flush by IPI, for
example, arm64, etc. It may incur one extra IPI for x86 and some
others, but x86 virtualization needs this since the guest may not
flush TLB by sending IPI IIUC. So if the one extra IPI is really a
problem, we may be able to define an arch-specific function to deal
with it, for example, a pv ops off the top of my head. But I'm not a
virtualization expert, I'm not entirely sure whether it is the best
way or not. But the complexity seems overkilling TBH since khugepaged
is usually not called that often.
>
>
On Thu, 2023-06-22 at 06:37 -0700, Dave Hansen wrote:
> On 6/22/23 06:14, [email protected] wrote:
> > I will send a new version with the local variable as you suggested
> > soon.
> > As for the config name, what about CONFIG_ARCH_HAS_MM_CPUMASK?
>
> The confusing part about that name is that mm_cpumask() and
> mm->cpu_bitmap[] are defined unconditionally. So, they're *around*
> unconditionally but just aren't updated.
>
I think your right about the config name,
How about the
CONFIG_ARCH_USE_MM_CPUMASK?
This has the right semantic as these archs use the cpumask field of the
mm struct.
> BTW, it would also be nice to have _some_ kind of data behind this
> patch.
>
> Fewer IPIs are better I guess, but it would still be nice if you
> could say:
>
> Before this patch, /proc/interrupts showed 123 IPIs/hour for an
> isolated CPU. After the approach here, it was 0.
>
> ... or something.
This is part of an ongoing effort to remove IPIs and this one was found
via code inspection.
On 6/26/23 07:36, [email protected] wrote:
> On Thu, 2023-06-22 at 06:37 -0700, Dave Hansen wrote:
>> On 6/22/23 06:14, [email protected] wrote:
>>> I will send a new version with the local variable as you suggested
>>> soon.
>>> As for the config name, what about CONFIG_ARCH_HAS_MM_CPUMASK?
>>
>> The confusing part about that name is that mm_cpumask() and
>> mm->cpu_bitmap[] are defined unconditionally. So, they're *around*
>> unconditionally but just aren't updated.
>>
> I think your right about the config name,
> How about the
> CONFIG_ARCH_USE_MM_CPUMASK?
> This has the right semantic as these archs use the cpumask field of the
> mm struct.
"USE" is still a command. It should, at worst, be "USES". But that's
still kinda generic. How about:
CONFIG_ARCH_UPDATES_MM_CPUMASK
?
>> BTW, it would also be nice to have _some_ kind of data behind this
>> patch.
>>
>> Fewer IPIs are better I guess, but it would still be nice if you
>> could say:
>>
>> Before this patch, /proc/interrupts showed 123 IPIs/hour for an
>> isolated CPU. After the approach here, it was 0.
>>
>> ... or something.
>
> This is part of an ongoing effort to remove IPIs and this one was found
> via code inspection.
OK, so it should be something more like:
This was found via code inspection, but fixing it isn't very
important so we didn't bother to test it any more than just
making sure the thing still boots when it is applied.
Does that cover it?
On Tue, Jun 20, 2023 at 05:46:18PM +0300, Yair Podemsky wrote:
> @@ -191,7 +191,13 @@ static void tlb_remove_table_smp_sync(void *arg)
> /* Simply deliver the interrupt */
> }
>
> -void tlb_remove_table_sync_one(void)
> +#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
> +#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
> +#else
> +#define REMOVE_TABLE_IPI_MASK cpu_online_mask
> +#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
> +
> +void tlb_remove_table_sync_one(struct mm_struct *mm)
> {
> /*
> * This isn't an RCU grace period and hence the page-tables cannot be
> @@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
> * It is however sufficient for software page-table walkers that rely on
> * IRQ disabling.
> */
> - smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
> + on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> + NULL, true);
Aside from what Dave said about the REMOVE_TABLE_IPI_MASK thing, this
isn't right.
on_each_cpu_mask() includes the current cpu, while smp_call_function()
explicitly does not.
Yes, they all end up in smp_call_function_many_cond(), but the
on_each_cpu*() family will have SCF_RUN_LOCAL set, while the
smp_call_function*() family will not.