While working with pcpu variables, I noticed that riscv did not support
first chunk allocation in the vmalloc area which may be needed as a fallback
in case of a sparse NUMA configuration.
patch 1 starts by introducing a new function flush_cache_vmap_early() which
is needed since a new vmalloc mapping is established and directly accessed:
on riscv, this would likely fail in case of a reordered access or if the
uarch caches invalid entries in TLB.
Note that most architectures do not include asm-generic/cacheflush.h so to
avoid build failures, this patch implements the new function on each of
those architectures. For all architectures except riscv, this new function
is implemented as a no-op to keep the existing behaviour but it likely
needs another implementation.
patch 2 simply enables the page percpu first chunk allocator in riscv.
Changes in v2:
- Rebase on top of 6.7
- Define flush_cache_vmap_early() for all architectures that do
not include <asm-generic/cacheflush.h> to avoid build failures
Alexandre Ghiti (2):
mm: Introduce flush_cache_vmap_early()
riscv: Enable pcpu page first chunk allocator
arch/arc/include/asm/cacheflush.h | 1 +
arch/arm/include/asm/cacheflush.h | 2 ++
arch/csky/abiv1/inc/abi/cacheflush.h | 1 +
arch/csky/abiv2/inc/abi/cacheflush.h | 1 +
arch/m68k/include/asm/cacheflush_mm.h | 1 +
arch/mips/include/asm/cacheflush.h | 2 ++
arch/nios2/include/asm/cacheflush.h | 1 +
arch/parisc/include/asm/cacheflush.h | 1 +
arch/riscv/Kconfig | 2 ++
arch/riscv/include/asm/cacheflush.h | 3 ++-
arch/riscv/include/asm/tlbflush.h | 1 +
arch/riscv/mm/kasan_init.c | 8 ++++++++
arch/riscv/mm/tlbflush.c | 5 +++++
arch/sh/include/asm/cacheflush.h | 1 +
arch/sparc/include/asm/cacheflush_32.h | 1 +
arch/sparc/include/asm/cacheflush_64.h | 1 +
arch/xtensa/include/asm/cacheflush.h | 6 ++++--
include/asm-generic/cacheflush.h | 6 ++++++
mm/percpu.c | 8 +-------
19 files changed, 42 insertions(+), 10 deletions(-)
--
2.39.2
The pcpu setup when using the page allocator sets up a new vmalloc
mapping very early in the boot process, so early that it cannot use the
flush_cache_vmap() function which may depend on structures not yet
initialized (for example in riscv, we currently send an IPI to flush
other cpus TLB).
But on some architectures, we must call flush_cache_vmap(): for example,
in riscv, some uarchs can cache invalid TLB entries so we need to flush
the new established mapping to avoid taking an exception.
So fix this by introducing a new function flush_cache_vmap_early() which
is called right after setting the new page table entry and before
accessing this new mapping. This new function implements a local flush
tlb on riscv and is no-op for other architectures (same as today).
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arc/include/asm/cacheflush.h | 1 +
arch/arm/include/asm/cacheflush.h | 2 ++
arch/csky/abiv1/inc/abi/cacheflush.h | 1 +
arch/csky/abiv2/inc/abi/cacheflush.h | 1 +
arch/m68k/include/asm/cacheflush_mm.h | 1 +
arch/mips/include/asm/cacheflush.h | 2 ++
arch/nios2/include/asm/cacheflush.h | 1 +
arch/parisc/include/asm/cacheflush.h | 1 +
arch/riscv/include/asm/cacheflush.h | 3 ++-
arch/riscv/include/asm/tlbflush.h | 1 +
arch/riscv/mm/tlbflush.c | 5 +++++
arch/sh/include/asm/cacheflush.h | 1 +
arch/sparc/include/asm/cacheflush_32.h | 1 +
arch/sparc/include/asm/cacheflush_64.h | 1 +
arch/xtensa/include/asm/cacheflush.h | 6 ++++--
include/asm-generic/cacheflush.h | 6 ++++++
mm/percpu.c | 8 +-------
17 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index bd5b1a9a0544..6fc74500a9f5 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -40,6 +40,7 @@ void dma_cache_wback(phys_addr_t start, unsigned long sz);
/* TBD: optimize this */
#define flush_cache_vmap(start, end) flush_cache_all()
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) flush_cache_all()
#define flush_cache_dup_mm(mm) /* called on fork (VIVT only) */
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index f6181f69577f..1075534b0a2e 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -340,6 +340,8 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end)
dsb(ishst);
}
+#define flush_cache_vmap_early(start, end) do { } while (0)
+
static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
{
if (!cache_is_vipt_nonaliasing())
diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index 908d8b0bc4fd..d011a81575d2 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -43,6 +43,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
*/
extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
#define flush_cache_vmap(start, end) cache_wbinv_all()
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) cache_wbinv_all()
#define flush_icache_range(start, end) cache_wbinv_range(start, end)
diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index 40be16907267..6513ac5d2578 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -41,6 +41,7 @@ void flush_icache_mm_range(struct mm_struct *mm,
void flush_icache_deferred(struct mm_struct *mm);
#define flush_cache_vmap(start, end) do { } while (0)
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h
index ed12358c4783..9a71b0148461 100644
--- a/arch/m68k/include/asm/cacheflush_mm.h
+++ b/arch/m68k/include/asm/cacheflush_mm.h
@@ -191,6 +191,7 @@ extern void cache_push_v(unsigned long vaddr, int len);
#define flush_cache_all() __flush_cache_all()
#define flush_cache_vmap(start, end) flush_cache_all()
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) flush_cache_all()
static inline void flush_cache_mm(struct mm_struct *mm)
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index f36c2519ed97..1f14132b3fc9 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -97,6 +97,8 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end)
__flush_cache_vmap();
}
+#define flush_cache_vmap_early(start, end) do { } while (0)
+
extern void (*__flush_cache_vunmap)(void);
static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h
index 348cea097792..81484a776b33 100644
--- a/arch/nios2/include/asm/cacheflush.h
+++ b/arch/nios2/include/asm/cacheflush.h
@@ -38,6 +38,7 @@ void flush_icache_pages(struct vm_area_struct *vma, struct page *page,
#define flush_icache_pages flush_icache_pages
#define flush_cache_vmap(start, end) flush_dcache_range(start, end)
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) flush_dcache_range(start, end)
extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index b4006f2a9705..ba4c05bc24d6 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -41,6 +41,7 @@ void flush_kernel_vmap_range(void *vaddr, int size);
void invalidate_kernel_vmap_range(void *vaddr, int size);
#define flush_cache_vmap(start, end) flush_cache_all()
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) flush_cache_all()
void flush_dcache_folio(struct folio *folio);
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 3cb53c4df27c..a129dac4521d 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -37,7 +37,8 @@ static inline void flush_dcache_page(struct page *page)
flush_icache_mm(vma->vm_mm, 0)
#ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end)
+#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end)
+#define flush_cache_vmap_early(start, end) local_flush_tlb_kernel_range(start, end)
#endif
#ifndef CONFIG_SMP
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 8f3418c5f172..a60416bbe190 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -41,6 +41,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr);
void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end);
void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+void local_flush_tlb_kernel_range(unsigned long start, unsigned long end);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index e6659d7368b3..8aadc5f71c93 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -66,6 +66,11 @@ static inline void local_flush_tlb_range_asid(unsigned long start,
local_flush_tlb_range_threshold_asid(start, size, stride, asid);
}
+void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+ local_flush_tlb_range_asid(start, end, PAGE_SIZE, FLUSH_TLB_NO_ASID);
+}
+
static void __ipi_flush_tlb_all(void *info)
{
local_flush_tlb_all();
diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index 878b6b551bd2..51112f54552b 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -90,6 +90,7 @@ extern void copy_from_user_page(struct vm_area_struct *vma,
unsigned long len);
#define flush_cache_vmap(start, end) local_flush_cache_all(NULL)
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) local_flush_cache_all(NULL)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h
index f3b7270bf71b..9fee0ccfccb8 100644
--- a/arch/sparc/include/asm/cacheflush_32.h
+++ b/arch/sparc/include/asm/cacheflush_32.h
@@ -48,6 +48,7 @@ static inline void flush_dcache_page(struct page *page)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
#define flush_cache_vmap(start, end) flush_cache_all()
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) flush_cache_all()
/* When a context switch happens we must flush all user windows so that
diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h
index 0e879004efff..2b1261b77ecd 100644
--- a/arch/sparc/include/asm/cacheflush_64.h
+++ b/arch/sparc/include/asm/cacheflush_64.h
@@ -75,6 +75,7 @@ void flush_ptrace_access(struct vm_area_struct *, struct page *,
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
#define flush_cache_vmap(start, end) do { } while (0)
+#define flush_cache_vmap_early(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
#endif /* !__ASSEMBLY__ */
diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h
index 785a00ce83c1..38bcecb0e457 100644
--- a/arch/xtensa/include/asm/cacheflush.h
+++ b/arch/xtensa/include/asm/cacheflush.h
@@ -116,8 +116,9 @@ void flush_cache_page(struct vm_area_struct*,
#define flush_cache_mm(mm) flush_cache_all()
#define flush_cache_dup_mm(mm) flush_cache_mm(mm)
-#define flush_cache_vmap(start,end) flush_cache_all()
-#define flush_cache_vunmap(start,end) flush_cache_all()
+#define flush_cache_vmap(start,end) flush_cache_all()
+#define flush_cache_vmap_early(start,end) do { } while (0)
+#define flush_cache_vunmap(start,end) flush_cache_all()
void flush_dcache_folio(struct folio *folio);
#define flush_dcache_folio flush_dcache_folio
@@ -140,6 +141,7 @@ void local_flush_cache_page(struct vm_area_struct *vma,
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_vmap(start,end) do { } while (0)
+#define flush_cache_vmap_early(start,end) do { } while (0)
#define flush_cache_vunmap(start,end) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 84ec53ccc450..7ee8a179d103 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -91,6 +91,12 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end)
}
#endif
+#ifndef flush_cache_vmap_early
+static inline void flush_cache_vmap_early(unsigned long start, unsigned long end)
+{
+}
+#endif
+
#ifndef flush_cache_vunmap
static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
{
diff --git a/mm/percpu.c b/mm/percpu.c
index 7b97d31df767..4e11fc1e6def 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3333,13 +3333,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
- /*
- * FIXME: Archs with virtual cache should flush local
- * cache for the linear mapping here - something
- * equivalent to flush_cache_vmap() on the local cpu.
- * flush_cache_vmap() can't be used as most supporting
- * data structures are not set up yet.
- */
+ flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
--
2.39.2
As explained in commit 6ea529a2037c ("percpu: make embedding first chunk
allocator check vmalloc space size"), the embedding first chunk allocator
needs the vmalloc space to be larger than the maximum distance between
units which are grouped into NUMA nodes.
On a very sparse NUMA configurations and a small vmalloc area (for example,
it is 64GB in sv39), the allocation of dynamic percpu data in the vmalloc
area could fail.
So provide the pcpu page allocator as a fallback in case we fall into
such a sparse configuration (which happened in arm64 as shown by
commit 09cea6195073 ("arm64: support page mapping percpu first chunk
allocator")).
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/Kconfig | 2 ++
arch/riscv/mm/kasan_init.c | 8 ++++++++
2 files changed, 10 insertions(+)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7603bd8ab333..8ba4a63e0ae5 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -415,7 +415,9 @@ config NUMA
depends on SMP && MMU
select ARCH_SUPPORTS_NUMA_BALANCING
select GENERIC_ARCH_NUMA
+ select HAVE_SETUP_PER_CPU_AREA
select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
select OF_NUMA
select USE_PERCPU_NUMA_NODE_ID
help
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 5e39dcf23fdb..4c9a2c527f08 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -438,6 +438,14 @@ static void __init kasan_shallow_populate(void *start, void *end)
kasan_shallow_populate_pgd(vaddr, vend);
}
+#ifdef CONFIG_KASAN_VMALLOC
+void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size)
+{
+ kasan_populate(kasan_mem_to_shadow(start),
+ kasan_mem_to_shadow(start + size));
+}
+#endif
+
static void __init create_tmp_mapping(void)
{
void *ptr;
--
2.39.2
On Tue, Dec 12, 2023 at 10:36 PM Alexandre Ghiti <[email protected]> wrote:
> The pcpu setup when using the page allocator sets up a new vmalloc
> mapping very early in the boot process, so early that it cannot use the
> flush_cache_vmap() function which may depend on structures not yet
> initialized (for example in riscv, we currently send an IPI to flush
> other cpus TLB).
>
> But on some architectures, we must call flush_cache_vmap(): for example,
> in riscv, some uarchs can cache invalid TLB entries so we need to flush
> the new established mapping to avoid taking an exception.
>
> So fix this by introducing a new function flush_cache_vmap_early() which
> is called right after setting the new page table entry and before
> accessing this new mapping. This new function implements a local flush
> tlb on riscv and is no-op for other architectures (same as today).
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> arch/m68k/include/asm/cacheflush_mm.h | 1 +
Acked-by: Geert Uytterhoeven <[email protected]>
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
Hello,
On Tue, Dec 12, 2023 at 10:34:55PM +0100, Alexandre Ghiti wrote:
> While working with pcpu variables, I noticed that riscv did not support
> first chunk allocation in the vmalloc area which may be needed as a fallback
> in case of a sparse NUMA configuration.
>
> patch 1 starts by introducing a new function flush_cache_vmap_early() which
> is needed since a new vmalloc mapping is established and directly accessed:
> on riscv, this would likely fail in case of a reordered access or if the
> uarch caches invalid entries in TLB.
> Note that most architectures do not include asm-generic/cacheflush.h so to
> avoid build failures, this patch implements the new function on each of
> those architectures. For all architectures except riscv, this new function
> is implemented as a no-op to keep the existing behaviour but it likely
> needs another implementation.
>
> patch 2 simply enables the page percpu first chunk allocator in riscv.
>
> Changes in v2:
> - Rebase on top of 6.7
> - Define flush_cache_vmap_early() for all architectures that do
> not include <asm-generic/cacheflush.h> to avoid build failures
>
> Alexandre Ghiti (2):
> mm: Introduce flush_cache_vmap_early()
> riscv: Enable pcpu page first chunk allocator
>
> arch/arc/include/asm/cacheflush.h | 1 +
> arch/arm/include/asm/cacheflush.h | 2 ++
> arch/csky/abiv1/inc/abi/cacheflush.h | 1 +
> arch/csky/abiv2/inc/abi/cacheflush.h | 1 +
> arch/m68k/include/asm/cacheflush_mm.h | 1 +
> arch/mips/include/asm/cacheflush.h | 2 ++
> arch/nios2/include/asm/cacheflush.h | 1 +
> arch/parisc/include/asm/cacheflush.h | 1 +
> arch/riscv/Kconfig | 2 ++
> arch/riscv/include/asm/cacheflush.h | 3 ++-
> arch/riscv/include/asm/tlbflush.h | 1 +
> arch/riscv/mm/kasan_init.c | 8 ++++++++
> arch/riscv/mm/tlbflush.c | 5 +++++
> arch/sh/include/asm/cacheflush.h | 1 +
> arch/sparc/include/asm/cacheflush_32.h | 1 +
> arch/sparc/include/asm/cacheflush_64.h | 1 +
> arch/xtensa/include/asm/cacheflush.h | 6 ++++--
> include/asm-generic/cacheflush.h | 6 ++++++
> mm/percpu.c | 8 +-------
> 19 files changed, 42 insertions(+), 10 deletions(-)
>
> --
> 2.39.2
>
Thanks for the quick v2. Applied to percpu#for-6.8.
Thanks,
Dennis
Hello:
This series was applied to riscv/linux.git (fixes)
by Dennis Zhou <[email protected]>:
On Tue, 12 Dec 2023 22:34:55 +0100 you wrote:
> While working with pcpu variables, I noticed that riscv did not support
> first chunk allocation in the vmalloc area which may be needed as a fallback
> in case of a sparse NUMA configuration.
>
> patch 1 starts by introducing a new function flush_cache_vmap_early() which
> is needed since a new vmalloc mapping is established and directly accessed:
> on riscv, this would likely fail in case of a reordered access or if the
> uarch caches invalid entries in TLB.
> Note that most architectures do not include asm-generic/cacheflush.h so to
> avoid build failures, this patch implements the new function on each of
> those architectures. For all architectures except riscv, this new function
> is implemented as a no-op to keep the existing behaviour but it likely
> needs another implementation.
>
> [...]
Here is the summary with links:
- [v2,1/2] mm: Introduce flush_cache_vmap_early()
https://git.kernel.org/riscv/c/7a92fc8b4d20
- [v2,2/2] riscv: Enable pcpu page first chunk allocator
https://git.kernel.org/riscv/c/6b9f29b81b15
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html