In cases where delayed rmap removal is not used (currently
UP and s390), skip the delayed_rmap flag and make the
related code paths no-ops.
Signed-off-by: Alexander Gordeev <[email protected]>
---
include/asm-generic/tlb.h | 32 +++++++++++++++++++-------------
mm/mmu_gather.c | 8 ++++----
2 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 154c774d6307..317bef9eee3c 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -265,24 +265,14 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
* This both sets 'delayed_rmap', and returns true. It would be an inline
* function, except we define it before the 'struct mmu_gather'.
*/
-#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
+#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
+#define tlb_reset_delay_rmap(tlb) ((tlb)->delayed_rmap = 0)
+#define tlb_rmap_delayed(tlb) ((tlb)->delayed_rmap)
extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
#endif
#endif
-/*
- * We have a no-op version of the rmap removal that doesn't
- * delay anything. That is used on S390, which flushes remote
- * TLBs synchronously, and on UP, which doesn't have any
- * remote TLBs to flush and is not preemptible due to this
- * all happening under the page table lock.
- */
-#ifndef tlb_delay_rmap
-#define tlb_delay_rmap(tlb) (false)
-static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-#endif
-
/*
* struct mmu_gather is an opaque type used by the mm code for passing around
* any data needed by arch specific code for tlb_remove_page.
@@ -313,10 +303,12 @@ struct mmu_gather {
*/
unsigned int freed_tables : 1;
+#ifdef tlb_delay_rmap
/*
* Do we have pending delayed rmap removals?
*/
unsigned int delayed_rmap : 1;
+#endif
/*
* at which levels have we cleared entries?
@@ -346,6 +338,20 @@ struct mmu_gather {
#endif
};
+/*
+ * We have a no-op version of the rmap removal that doesn't
+ * delay anything. That is used on S390, which flushes remote
+ * TLBs synchronously, and on UP, which doesn't have any
+ * remote TLBs to flush and is not preemptible due to this
+ * all happening under the page table lock.
+ */
+#ifndef tlb_delay_rmap
+#define tlb_delay_rmap(tlb) (false)
+#define tlb_reset_delay_rmap(tlb) do { } while (0)
+#define tlb_rmap_delayed(tlb) (false)
+static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
+#endif
+
void tlb_flush_mmu(struct mmu_gather *tlb);
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 9f22309affee..b0f1bd20af2f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -20,7 +20,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
struct mmu_gather_batch *batch;
/* No more batching if we have delayed rmaps pending */
- if (tlb->delayed_rmap)
+ if (tlb_rmap_delayed(tlb))
return false;
batch = tlb->active;
@@ -60,7 +60,7 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
struct mmu_gather_batch *batch;
- if (!tlb->delayed_rmap)
+ if (!tlb_rmap_delayed(tlb))
return;
batch = tlb->active;
@@ -73,7 +73,7 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
}
}
- tlb->delayed_rmap = 0;
+ tlb_reset_delay_rmap(tlb);
}
#endif
@@ -311,7 +311,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->active = &tlb->local;
tlb->batch_count = 0;
#endif
- tlb->delayed_rmap = 0;
+ tlb_reset_delay_rmap(tlb);
tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
--
2.31.1
On Tue, Nov 15, 2022 at 11:51 PM Alexander Gordeev
<[email protected]> wrote:
>
> In cases the delayed rmap removal is not used (which are
> currently UP and s390) skip delayed_rmap flag and make
> the related code paths no-op.
So I'm not convinced about this patch.
I particularly dislike adding even more #ifdef's around the data
structure - it already is pretty nasty, and it was hard to see where
things were initialized.
The only actual code impact of this is in tlb_next_batch(), which
tests for "do I have delayed rmaps pending, in which case I won't add
new batches". Everything else is already either optimized away, or
just "one bit declared in a structure that already has bitfields and
has room for several extra bits".
And that "I need to allocate new batches" case really doesn't matter
anyway - it's not even built at all on s390, and on UP, where it's
there but technically pointless to have the test, it really isn't
noticeable.
So with the previous patch I was "this shouldn't actually _matter_, but
it does seem cleaner to do it this way".
But _this_ patch makes me go "it still doesn't matter, but now this
patch is actually adding extra infrastructure for the 'not-mattering'
case".
So I don't _hate_ this patch, but I think this actually makes the
current mess wrt our 'struct mmu_gather' worse rather than better.
That structure is already a pain, with horrendous initialization and
different bit-fields having different lifetimes. I'd rather have one
unconditional simple bitfield, than have another bitfield that has
conditional complications.
Linus