This patchset implements sv48 support at runtime. The kernel will try to
boot with a 4-level page table and will fall back to 3-level if the HW does
not support it.
The biggest advantage is that we only have one kernel for 64bit, which
is way easier to maintain.
Folding the 4th level into a 3-level page table has almost no cost at
runtime. But as Palmer mentioned, the generated relocatable code is less
performant.
At the moment, there is no way to build a non-relocatable 64bit kernel with
a 3-level page table. We agreed that distributions will use this runtime
configuration anyway, but Palmer proposed to introduce a new Kconfig, which
I will do later as sv48 support was requested for 5.8.
Finally, the user can now ask for sv39 explicitly through the device tree,
which reduces the memory footprint and the number of memory accesses in
case of a TLB miss.
Alexandre Ghiti (8):
riscv: Get rid of compile time logic with MAX_EARLY_MAPPING_SIZE
riscv: Allow to dynamically define VA_BITS
riscv: Simplify MAXPHYSMEM config
riscv: Prepare ptdump for vm layout dynamic addresses
riscv: Implement sv48 support
riscv: Allow user to downgrade to sv39 when hw supports sv48
riscv: Use pgtable_l4_enabled to output mmu type in cpuinfo
riscv: Explicit comment about user virtual address space size
arch/riscv/Kconfig | 34 ++--
arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 4 -
arch/riscv/include/asm/csr.h | 3 +-
arch/riscv/include/asm/fixmap.h | 1 +
arch/riscv/include/asm/page.h | 15 ++
arch/riscv/include/asm/pgalloc.h | 36 ++++
arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++-
arch/riscv/include/asm/pgtable.h | 30 +++-
arch/riscv/include/asm/sparsemem.h | 2 +-
arch/riscv/kernel/cpu.c | 24 +--
arch/riscv/kernel/head.S | 3 +-
arch/riscv/mm/context.c | 4 +-
arch/riscv/mm/init.c | 194 ++++++++++++++++++---
arch/riscv/mm/ptdump.c | 49 +++++-
14 files changed, 410 insertions(+), 86 deletions(-)
--
2.20.1
With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Anup Patel <[email protected]>
Reviewed-by: Palmer Dabbelt <[email protected]>
---
arch/riscv/Kconfig | 10 ----------
arch/riscv/include/asm/pgtable.h | 12 ++++++++++--
arch/riscv/include/asm/sparsemem.h | 2 +-
3 files changed, 11 insertions(+), 13 deletions(-)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 93127d5913fe..64b25a90d60f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -101,16 +101,6 @@ config ZONE_DMA32
bool
default y if 64BIT
-config VA_BITS
- int
- default 32 if 32BIT
- default 39 if 64BIT
-
-config PA_BITS
- int
- default 34 if 32BIT
- default 56 if 64BIT
-
config PAGE_OFFSET
hex
default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 25213cfaf680..8e96315b3366 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -41,7 +41,7 @@
* position vmemmap directly below the VMALLOC region.
*/
#define VMEMMAP_SHIFT \
- (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
+ (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
#define VMEMMAP_END (VMALLOC_START - 1)
#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE)
@@ -80,6 +80,14 @@
#endif /* CONFIG_64BIT */
#ifdef CONFIG_MMU
+#ifdef CONFIG_64BIT
+#define VA_BITS 39
+#define PA_BITS 56
+#else
+#define VA_BITS 32
+#define PA_BITS 34
+#endif
+
/* Number of entries in the page global directory */
#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t))
/* Number of entries in the page table */
@@ -466,7 +474,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
* and give the kernel the other (upper) half.
*/
#ifdef CONFIG_64BIT
-#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE)
+#define KERN_VIRT_START (-(BIT(VA_BITS)) + TASK_SIZE)
#else
#define KERN_VIRT_START FIXADDR_START
#endif
diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h
index 45a7018a8118..f08d72155bc8 100644
--- a/arch/riscv/include/asm/sparsemem.h
+++ b/arch/riscv/include/asm/sparsemem.h
@@ -4,7 +4,7 @@
#define _ASM_RISCV_SPARSEMEM_H
#ifdef CONFIG_SPARSEMEM
-#define MAX_PHYSMEM_BITS CONFIG_PA_BITS
+#define MAX_PHYSMEM_BITS PA_BITS
#define SECTION_SIZE_BITS 27
#endif /* CONFIG_SPARSEMEM */
--
2.20.1
There is no need to compare the MAX_EARLY_MAPPING_SIZE value with
PGDIR_SIZE at compile time, since MAX_EARLY_MAPPING_SIZE is set to 128MB,
which is less than PGDIR_SIZE (equal to 1GB): this allows simplifying the
early_pmd definition.
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Anup Patel <[email protected]>
Reviewed-by: Palmer Dabbelt <[email protected]>
---
arch/riscv/mm/init.c | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 7074522d40c6..5782cae58ac2 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -256,13 +256,7 @@ static void __init create_pte_mapping(pte_t *ptep,
pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
-
-#if MAX_EARLY_MAPPING_SIZE < PGDIR_SIZE
-#define NUM_EARLY_PMDS 1UL
-#else
-#define NUM_EARLY_PMDS (1UL + MAX_EARLY_MAPPING_SIZE / PGDIR_SIZE)
-#endif
-pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE);
+pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
static pmd_t *__init get_pmd_virt(phys_addr_t pa)
{
@@ -276,14 +270,12 @@ static pmd_t *__init get_pmd_virt(phys_addr_t pa)
static phys_addr_t __init alloc_pmd(uintptr_t va)
{
- uintptr_t pmd_num;
-
if (mmu_enabled)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
- pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
- BUG_ON(pmd_num >= NUM_EARLY_PMDS);
- return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
+ BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
+
+ return (uintptr_t)early_pmd;
}
static void __init create_pmd_mapping(pmd_t *pmdp,
--
2.20.1
Either the user specifies a maximum physical memory size of 2GB, or the
user lives with the system constraint, which for now is 1/4th of the
maximum addressable memory in Sv39 MMU mode (i.e. 128GB).
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Anup Patel <[email protected]>
---
arch/riscv/Kconfig | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 64b25a90d60f..e167f16131f4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -106,7 +106,7 @@ config PAGE_OFFSET
default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
default 0x80000000 if 64BIT && !MMU
default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
- default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
+ default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
config ARCH_FLATMEM_ENABLE
def_bool y
@@ -223,19 +223,11 @@ config MODULE_SECTIONS
bool
select HAVE_MOD_ARCH_SPECIFIC
-choice
- prompt "Maximum Physical Memory"
- default MAXPHYSMEM_2GB if 32BIT
- default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW
- default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY
-
- config MAXPHYSMEM_2GB
- bool "2GiB"
- config MAXPHYSMEM_128GB
- depends on 64BIT && CMODEL_MEDANY
- bool "128GiB"
-endchoice
-
+config MAXPHYSMEM_2GB
+ bool "Maximum Physical Memory 2GiB"
+ default y if 32BIT
+ default y if 64BIT && CMODEL_MEDLOW
+ default n
config SMP
bool "Symmetric Multi-Processing"
--
2.20.1
Adding a new 4th level of page table allows the 64bit kernel to address
2^48 bytes of virtual address space: in practice, that offers 128TB of
virtual address space to userspace (half of the 256TB total) and allows up
to 64TB of physical memory.
If the underlying hardware does not support sv48, we will automatically
fall back to a standard 3-level page table by folding the new PUD level into
the PGDIR level. In order to detect HW capabilities at runtime, we
use the SATP feature that ignores writes with an unsupported mode.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/Kconfig | 6 +-
arch/riscv/include/asm/csr.h | 3 +-
arch/riscv/include/asm/fixmap.h | 1 +
arch/riscv/include/asm/page.h | 15 +++
arch/riscv/include/asm/pgalloc.h | 36 +++++++
arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++++++++-
arch/riscv/include/asm/pgtable.h | 9 +-
arch/riscv/kernel/head.S | 3 +-
arch/riscv/mm/context.c | 4 +-
arch/riscv/mm/init.c | 159 +++++++++++++++++++++++++---
10 files changed, 309 insertions(+), 24 deletions(-)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e167f16131f4..3f73f60e9732 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -68,6 +68,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select HAVE_COPY_THREAD_TLS
select HAVE_ARCH_KASAN if MMU && 64BIT
+ select RELOCATABLE if 64BIT
config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
@@ -106,7 +107,7 @@ config PAGE_OFFSET
default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
default 0x80000000 if 64BIT && !MMU
default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
- default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
+ default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB
config ARCH_FLATMEM_ENABLE
def_bool y
@@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
config FIX_EARLYCON_MEM
def_bool MMU
+# On a 64BIT relocatable kernel, the 4-level page table is at runtime folded
+# on a 3-level page table when sv48 is not supported.
config PGTABLE_LEVELS
int
+ default 4 if 64BIT && RELOCATABLE
default 3 if 64BIT
default 2
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index cec462e198ce..d41536c3f8d4 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -40,11 +40,10 @@
#ifndef CONFIG_64BIT
#define SATP_PPN _AC(0x003FFFFF, UL)
#define SATP_MODE_32 _AC(0x80000000, UL)
-#define SATP_MODE SATP_MODE_32
#else
#define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
#define SATP_MODE_39 _AC(0x8000000000000000, UL)
-#define SATP_MODE SATP_MODE_39
+#define SATP_MODE_48 _AC(0x9000000000000000, UL)
#endif
/* Exception cause high bit - is an interrupt if set */
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 2368d49eb4ef..d891cf9c73c5 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -27,6 +27,7 @@ enum fixed_addresses {
FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
FIX_PTE,
FIX_PMD,
+ FIX_PUD,
FIX_TEXT_POKE1,
FIX_TEXT_POKE0,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 48bb09b6a9b7..5e77fe7f0d6d 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,7 +31,19 @@
* When not using MMU this corresponds to the first free page in
* physical memory (aligned on a page boundary).
*/
+#ifdef CONFIG_RELOCATABLE
+#define PAGE_OFFSET __page_offset
+
+#ifdef CONFIG_64BIT
+/*
+ * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
+ * define the PAGE_OFFSET value for SV39.
+ */
+#define PAGE_OFFSET_L3 0xffffffe000000000
+#endif /* CONFIG_64BIT */
+#else
#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
+#endif /* CONFIG_RELOCATABLE */
#define KERN_VIRT_SIZE (-PAGE_OFFSET)
@@ -102,6 +114,9 @@ extern unsigned long pfn_base;
extern unsigned long max_low_pfn;
extern unsigned long min_low_pfn;
extern unsigned long kernel_virt_addr;
+#ifdef CONFIG_RELOCATABLE
+extern unsigned long __page_offset;
+#endif
#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 3f601ee8233f..540eaa5a8658 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
}
+
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
+{
+ if (pgtable_l4_enabled) {
+ unsigned long pfn = virt_to_pfn(pud);
+
+ set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ }
+}
+
+static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
+ pud_t *pud)
+{
+ if (pgtable_l4_enabled) {
+ unsigned long pfn = virt_to_pfn(pud);
+
+ set_p4d_safe(p4d,
+ __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ }
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ if (pgtable_l4_enabled)
+ return (pud_t *)__get_free_page(
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
+ return NULL;
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ if (pgtable_l4_enabled)
+ free_page((unsigned long)pud);
+}
+
+#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
#endif /* __PAGETABLE_PMD_FOLDED */
#define pmd_pgtable(pmd) pmd_page(pmd)
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index b15f70a1fdfa..c84c31fbf8da 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -8,16 +8,32 @@
#include <linux/const.h>
-#define PGDIR_SHIFT 30
+extern bool pgtable_l4_enabled;
+
+#define PGDIR_SHIFT (pgtable_l4_enabled ? 39 : 30)
/* Size of region mapped by a page global directory */
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+/* pud is folded into pgd in case of 3-level page table */
+#define PUD_SHIFT 30
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+
#define PMD_SHIFT 21
/* Size of region mapped by a page middle directory */
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE - 1))
+/* Page Upper Directory entry */
+typedef struct {
+ unsigned long pud;
+} pud_t;
+
+#define pud_val(x) ((x).pud)
+#define __pud(x) ((pud_t) { (x) })
+#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
+
/* Page Middle Directory entry */
typedef struct {
unsigned long pmd;
@@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
+static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
+{
+ return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
+}
+
+static inline unsigned long _pud_pfn(pud_t pud)
+{
+ return pud_val(pud) >> _PAGE_PFN_SHIFT;
+}
+
static inline unsigned long pud_page_vaddr(pud_t pud)
{
return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
@@ -70,6 +96,15 @@ static inline struct page *pud_page(pud_t pud)
return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
}
+#define mm_pud_folded mm_pud_folded
+static inline bool mm_pud_folded(struct mm_struct *mm)
+{
+ if (pgtable_l4_enabled)
+ return false;
+
+ return true;
+}
+
#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
@@ -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+ pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ *p4dp = p4d;
+ else
+ set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
+}
+
+static inline int p4d_none(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (p4d_val(p4d) == 0);
+
+ return 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (p4d_val(p4d) & _PAGE_PRESENT);
+
+ return 1;
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return !p4d_present(p4d);
+
+ return 0;
+}
+
+static inline void p4d_clear(p4d_t *p4d)
+{
+ if (pgtable_l4_enabled)
+ set_p4d(p4d, __p4d(0));
+}
+
+static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (unsigned long)pfn_to_virt(
+ p4d_val(p4d) >> _PAGE_PFN_SHIFT);
+
+ return pud_page_vaddr((pud_t) { p4d_val(p4d) });
+}
+
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+ if (pgtable_l4_enabled)
+ return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+
+ return (pud_t *)p4d;
+}
+
#endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 8e96315b3366..b8a8ba69d0a2 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -20,12 +20,14 @@
* the kernel.
*/
#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1)
-#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR
+#define KERNEL_LINK_ADDR (VMALLOC_LINK_END - SZ_2G + 1)
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END (PAGE_OFFSET - 1)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
+#define VMALLOC_LINK_END (_AC(CONFIG_PAGE_OFFSET, UL) - 1)
+
#define BPF_JIT_REGION_SIZE (SZ_128M)
#define BPF_JIT_REGION_START (kernel_virt_addr)
#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)
@@ -67,8 +69,7 @@
#ifndef __ASSEMBLY__
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
+#include <asm-generic/pgtable-nop4d.h>
#include <asm/page.h>
#include <asm/tlbflush.h>
#include <linux/mm_types.h>
@@ -81,7 +82,7 @@
#ifdef CONFIG_MMU
#ifdef CONFIG_64BIT
-#define VA_BITS 39
+#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
#define PA_BITS 56
#else
#define VA_BITS 32
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 8f5bb7731327..0632c4834c68 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -62,7 +62,8 @@ relocate:
/* Compute satp for kernel page tables, but don't load it yet */
srl a2, a0, PAGE_SHIFT
- li a1, SATP_MODE
+ la a1, satp_mode
+ REG_L a1, 0(a1)
or a2, a2, a1
/*
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 613ec81a8979..6830504f8b11 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -9,6 +9,8 @@
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
+extern u64 satp_mode;
+
/*
* When necessary, performs a deferred icache flush for the given MM context,
* on the local CPU. RISC-V has no direct mechanism for instruction cache
@@ -59,7 +61,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
cpumask_set_cpu(cpu, mm_cpumask(next));
#ifdef CONFIG_MMU
- csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE);
+ csr_write(CSR_SATP, virt_to_pfn(next->pgd) | satp_mode);
local_flush_tlb_all();
#endif
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 5782cae58ac2..bad8da099ff6 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -25,8 +25,23 @@
#include "../kernel/head.h"
-unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
+#ifdef CONFIG_64BIT
+u64 satp_mode = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ?
+ SATP_MODE_39 : SATP_MODE_48;
+bool pgtable_l4_enabled = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ? false : true;
+#else
+u64 satp_mode = SATP_MODE_32;
+bool pgtable_l4_enabled;
+#endif
+EXPORT_SYMBOL(pgtable_l4_enabled);
+EXPORT_SYMBOL(satp_mode);
+
+unsigned long kernel_virt_addr;
EXPORT_SYMBOL(kernel_virt_addr);
+#ifdef CONFIG_RELOCATABLE
+unsigned long __page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
+EXPORT_SYMBOL(__page_offset);
+#endif
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
@@ -254,9 +269,12 @@ static void __init create_pte_mapping(pte_t *ptep,
#ifndef __PAGETABLE_PMD_FOLDED
+pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
+pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
static pmd_t *__init get_pmd_virt(phys_addr_t pa)
{
@@ -273,7 +291,8 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
if (mmu_enabled)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
- BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
+ /* Only one PMD is available for early mapping */
+ BUG_ON((va - kernel_virt_addr) >> PUD_SHIFT);
return (uintptr_t)early_pmd;
}
@@ -305,19 +324,70 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
create_pte_mapping(ptep, va, pa, sz, prot);
}
-#define pgd_next_t pmd_t
-#define alloc_pgd_next(__va) alloc_pmd(__va)
-#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
+static pud_t *__init get_pud_virt(phys_addr_t pa)
+{
+ if (mmu_enabled) {
+ clear_fixmap(FIX_PUD);
+ return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
+ } else {
+ return (pud_t *)((uintptr_t)pa);
+ }
+}
+
+static phys_addr_t __init alloc_pud(uintptr_t va)
+{
+ if (mmu_enabled)
+ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+
+ /* Only one PUD is available for early mapping */
+ BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
+
+ return (uintptr_t)early_pud;
+}
+
+static void __init create_pud_mapping(pud_t *pudp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ pmd_t *nextp;
+ phys_addr_t next_phys;
+ uintptr_t pud_index = pud_index(va);
+
+ if (sz == PUD_SIZE) {
+ if (pud_val(pudp[pud_index]) == 0)
+ pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pud_val(pudp[pud_index]) == 0) {
+ next_phys = alloc_pmd(va);
+ pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
+ nextp = get_pmd_virt(next_phys);
+ memset(nextp, 0, PAGE_SIZE);
+ } else {
+ next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
+ nextp = get_pmd_virt(next_phys);
+ }
+
+ create_pmd_mapping(nextp, va, pa, sz, prot);
+}
+
+#define pgd_next_t pud_t
+#define alloc_pgd_next(__va) alloc_pud(__va)
+#define get_pgd_next_virt(__pa) get_pud_virt(__pa)
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
- create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next fixmap_pmd
+ create_pud_mapping(__nextp, __va, __pa, __sz, __prot)
+#define fixmap_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
+#define trampoline_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
#else
#define pgd_next_t pte_t
#define alloc_pgd_next(__va) alloc_pte(__va)
#define get_pgd_next_virt(__pa) get_pte_virt(__pa)
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next fixmap_pte
+#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
#endif
static void __init create_pgd_mapping(pgd_t *pgdp,
@@ -328,6 +398,13 @@ static void __init create_pgd_mapping(pgd_t *pgdp,
phys_addr_t next_phys;
uintptr_t pgd_index = pgd_index(va);
+#ifndef __PAGETABLE_PMD_FOLDED
+ if (!pgtable_l4_enabled) {
+ create_pud_mapping((pud_t *)pgdp, va, pa, sz, prot);
+ return;
+ }
+#endif
+
if (sz == PGDIR_SIZE) {
if (pgd_val(pgdp[pgd_index]) == 0)
pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
@@ -419,6 +496,47 @@ void __init relocate_kernel(uintptr_t load_pa)
}
}
+#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
+void disable_pgtable_l4(void)
+{
+ pgtable_l4_enabled = false;
+ __page_offset = PAGE_OFFSET_L3;
+ satp_mode = SATP_MODE_39;
+}
+
+/* There is a simple way to determine if 4-level is supported by the
+ * underlying hardware: establish 1:1 mapping in 4-level page table mode
+ * then read SATP to see if the configuration was taken into account
+ * meaning sv48 is supported.
+ */
+asmlinkage __init void set_satp_mode(uintptr_t load_pa)
+{
+ u64 identity_satp, hw_satp;
+ int cpus_node;
+
+ create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
+ PGDIR_SIZE, PAGE_TABLE);
+ create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
+ PUD_SIZE, PAGE_TABLE);
+ create_pmd_mapping(early_pmd, load_pa, load_pa,
+ PMD_SIZE, PAGE_KERNEL_EXEC);
+
+ identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
+ local_flush_tlb_all();
+ csr_write(CSR_SATP, identity_satp);
+
+ hw_satp = csr_read(CSR_SATP);
+ csr_write(CSR_SATP, 0ULL);
+ local_flush_tlb_all();
+
+ if (hw_satp != identity_satp)
+ disable_pgtable_l4();
+
+ memset(early_pg_dir, 0, PAGE_SIZE);
+ memset(early_pud, 0, PAGE_SIZE);
+ memset(early_pmd, 0, PAGE_SIZE);
+}
+#endif
#endif
static uintptr_t load_pa, load_sz;
@@ -442,9 +560,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
load_pa = (uintptr_t)(&_start);
load_sz = (uintptr_t)(&_end) - load_pa;
+#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
+ set_satp_mode(load_pa);
+#endif
+
+ kernel_virt_addr = KERNEL_VIRT_ADDR;
+
va_pa_offset = PAGE_OFFSET - load_pa;
va_kernel_pa_offset = kernel_virt_addr - load_pa;
-
pfn_base = PFN_DOWN(load_pa);
#ifdef CONFIG_RELOCATABLE
@@ -473,15 +596,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
/* Setup early PGD for fixmap */
create_pgd_mapping(early_pg_dir, FIXADDR_START,
- (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+ fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
#ifndef __PAGETABLE_PMD_FOLDED
- /* Setup fixmap PMD */
+ /* Setup fixmap PUD and PMD */
+ if (pgtable_l4_enabled)
+ create_pud_mapping(fixmap_pud, FIXADDR_START,
+ (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
create_pmd_mapping(fixmap_pmd, FIXADDR_START,
(uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
+
/* Setup trampoline PGD and PMD */
create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
- (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+ trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+ if (pgtable_l4_enabled)
+ create_pud_mapping(trampoline_pud, kernel_virt_addr,
+ (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
#else
@@ -558,12 +688,13 @@ static void __init setup_vm_final(void)
vm_area_add_early(&vm_kernel);
- /* Clear fixmap PTE and PMD mappings */
+ /* Clear fixmap page table mappings */
clear_fixmap(FIX_PTE);
clear_fixmap(FIX_PMD);
+ clear_fixmap(FIX_PUD);
/* Move to swapper page table */
- csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
+ csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
local_flush_tlb_all();
}
--
2.20.1
This is a preparatory patch for sv48 support that will introduce
dynamic PAGE_OFFSET.
Dynamic PAGE_OFFSET implies that all zones (vmalloc, vmemmap, fixaddr...)
whose addresses depend on PAGE_OFFSET become dynamic and can't be used
to statically initialize the array used by ptdump to identify the
different zones of the vm layout.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/mm/ptdump.c | 49 ++++++++++++++++++++++++++++++++++--------
1 file changed, 40 insertions(+), 9 deletions(-)
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 7eab76a93106..7d9386a7f5c2 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -49,22 +49,41 @@ struct addr_marker {
const char *name;
};
+enum address_markers_idx {
+#ifdef CONFIG_KASAN
+ KASAN_SHADOW_START_NR,
+ KASAN_SHADOW_END_NR,
+#endif
+ FIXMAP_START_NR,
+ FIXMAP_END_NR,
+ PCI_IO_START_NR,
+ PCI_IO_END_NR,
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ VMEMMAP_START_NR,
+ VMEMMAP_END_NR,
+#endif
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+ PAGE_OFFSET_NR,
+ END_OF_SPACE_NR
+};
+
static struct addr_marker address_markers[] = {
#ifdef CONFIG_KASAN
{KASAN_SHADOW_START, "Kasan shadow start"},
{KASAN_SHADOW_END, "Kasan shadow end"},
#endif
- {FIXADDR_START, "Fixmap start"},
- {FIXADDR_TOP, "Fixmap end"},
- {PCI_IO_START, "PCI I/O start"},
- {PCI_IO_END, "PCI I/O end"},
+ {0, "Fixmap start"},
+ {0, "Fixmap end"},
+ {0, "PCI I/O start"},
+ {0, "PCI I/O end"},
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- {VMEMMAP_START, "vmemmap start"},
- {VMEMMAP_END, "vmemmap end"},
+ {0, "vmemmap start"},
+ {0, "vmemmap end"},
#endif
- {VMALLOC_START, "vmalloc() area"},
- {VMALLOC_END, "vmalloc() end"},
- {PAGE_OFFSET, "Linear mapping"},
+ {0, "vmalloc() area"},
+ {0, "vmalloc() end"},
+ {0, "Linear mapping"},
{-1, NULL},
};
@@ -304,6 +323,18 @@ static int ptdump_init(void)
{
unsigned int i, j;
+ address_markers[FIXMAP_START_NR].start_address = FIXADDR_START;
+ address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP;
+ address_markers[PCI_IO_START_NR].start_address = PCI_IO_START;
+ address_markers[PCI_IO_END_NR].start_address = PCI_IO_END;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+ address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END;
+#endif
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+ address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET;
+
for (i = 0; i < ARRAY_SIZE(pg_level); i++)
for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
pg_level[i].mask |= pte_bits[j].mask;
--
2.20.1
This is made possible by using the mmu-type property of the cpu node of
the device tree.
By default, the kernel will boot with a 4-level page table if the hw
supports it, but it can be interesting for the user to select a 3-level page
table as it consumes less memory and is faster since it requires fewer
memory accesses in case of a TLB miss.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/mm/init.c | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index bad8da099ff6..1776eeb53d61 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -509,11 +509,32 @@ void disable_pgtable_l4(void)
* then read SATP to see if the configuration was taken into account
* meaning sv48 is supported.
*/
-asmlinkage __init void set_satp_mode(uintptr_t load_pa)
+asmlinkage __init void set_satp_mode(uintptr_t load_pa, uintptr_t dtb_pa)
{
u64 identity_satp, hw_satp;
int cpus_node;
+ /* 1/ Check if the user asked for sv39 explicitly in the device tree */
+ cpus_node = fdt_path_offset((void *)dtb_pa, "/cpus");
+ if (cpus_node >= 0) {
+ int node;
+
+ fdt_for_each_subnode(node, (void *)dtb_pa, cpus_node) {
+ const char *mmu_type = fdt_getprop((void *)dtb_pa, node,
+ "mmu-type", NULL);
+ if (!mmu_type)
+ continue;
+
+ if (!strcmp(mmu_type, "riscv,sv39")) {
+ disable_pgtable_l4();
+ return;
+ }
+
+ break;
+ }
+ }
+
+ /* 2/ Determine if the HW supports sv48: if not, fallback to sv39 */
create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
PGDIR_SIZE, PAGE_TABLE);
create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
@@ -561,7 +582,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
load_sz = (uintptr_t)(&_end) - load_pa;
#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
- set_satp_mode(load_pa);
+ set_satp_mode(load_pa, dtb_pa);
#endif
kernel_virt_addr = KERNEL_VIRT_ADDR;
--
2.20.1
Now that the mmu type is determined at runtime using the SATP
behavior, use the global variable pgtable_l4_enabled to output the
mmu type of the processor through /proc/cpuinfo instead of relying on
device tree information.
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Anup Patel <[email protected]>
Reviewed-by: Palmer Dabbelt <[email protected]>
---
arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 4 ----
arch/riscv/kernel/cpu.c | 24 ++++++++++++----------
2 files changed, 13 insertions(+), 15 deletions(-)
diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
index 7db861053483..6138590a2229 100644
--- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
+++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
@@ -50,7 +50,6 @@
i-cache-size = <32768>;
i-tlb-sets = <1>;
i-tlb-size = <32>;
- mmu-type = "riscv,sv39";
reg = <1>;
riscv,isa = "rv64imafdc";
tlb-split;
@@ -74,7 +73,6 @@
i-cache-size = <32768>;
i-tlb-sets = <1>;
i-tlb-size = <32>;
- mmu-type = "riscv,sv39";
reg = <2>;
riscv,isa = "rv64imafdc";
tlb-split;
@@ -98,7 +96,6 @@
i-cache-size = <32768>;
i-tlb-sets = <1>;
i-tlb-size = <32>;
- mmu-type = "riscv,sv39";
reg = <3>;
riscv,isa = "rv64imafdc";
tlb-split;
@@ -122,7 +119,6 @@
i-cache-size = <32768>;
i-tlb-sets = <1>;
i-tlb-size = <32>;
- mmu-type = "riscv,sv39";
reg = <4>;
riscv,isa = "rv64imafdc";
tlb-split;
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 40a3c442ac5f..38a699b997a8 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -8,6 +8,8 @@
#include <linux/of.h>
#include <asm/smp.h>
+extern bool pgtable_l4_enabled;
+
/*
* Returns the hart ID of the given device tree node, or -ENODEV if the node
* isn't an enabled and valid RISC-V hart node.
@@ -54,18 +56,19 @@ static void print_isa(struct seq_file *f, const char *isa)
seq_puts(f, "\n");
}
-static void print_mmu(struct seq_file *f, const char *mmu_type)
+static void print_mmu(struct seq_file *f)
{
+ char sv_type[16];
+
#if defined(CONFIG_32BIT)
- if (strcmp(mmu_type, "riscv,sv32") != 0)
- return;
+ strncpy(sv_type, "sv32", 5);
#elif defined(CONFIG_64BIT)
- if (strcmp(mmu_type, "riscv,sv39") != 0 &&
- strcmp(mmu_type, "riscv,sv48") != 0)
- return;
+ if (pgtable_l4_enabled)
+ strncpy(sv_type, "sv48", 5);
+ else
+ strncpy(sv_type, "sv39", 5);
#endif
-
- seq_printf(f, "mmu\t\t: %s\n", mmu_type+6);
+ seq_printf(f, "mmu\t\t: %s\n", sv_type);
}
static void *c_start(struct seq_file *m, loff_t *pos)
@@ -90,14 +93,13 @@ static int c_show(struct seq_file *m, void *v)
{
unsigned long cpu_id = (unsigned long)v - 1;
struct device_node *node = of_get_cpu_node(cpu_id, NULL);
- const char *compat, *isa, *mmu;
+ const char *compat, *isa;
seq_printf(m, "processor\t: %lu\n", cpu_id);
seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id));
if (!of_property_read_string(node, "riscv,isa", &isa))
print_isa(m, isa);
- if (!of_property_read_string(node, "mmu-type", &mmu))
- print_mmu(m, mmu);
+ print_mmu(m);
if (!of_property_read_string(node, "compatible", &compat)
&& strcmp(compat, "riscv"))
seq_printf(m, "uarch\t\t: %s\n", compat);
--
2.20.1
Define precisely the size of the user accessible virtual address space
for sv32/39/48 mmu types and explain why the whole virtual address
space is split into 2 equal chunks between kernel and user space.
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Anup Patel <[email protected]>
Reviewed-by: Palmer Dabbelt <[email protected]>
---
arch/riscv/include/asm/pgtable.h | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index b8a8ba69d0a2..0c7d07f614b3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -481,8 +481,15 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#endif
/*
- * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
- * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
+ * Task size is:
+ * - 0x9fc00000 (~2.5GB) for RV32.
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu
+ * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu
+ *
+ * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V
+ * Instruction Set Manual Volume II: Privileged Architecture" states that
+ * "load and store effective addresses, which are 64bits, must have bits
+ * 63–48 all equal to bit 47, or else a page-fault exception will occur."
*/
#ifdef CONFIG_64BIT
#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
--
2.20.1
On Sun, May 24, 2020 at 2:44 PM Alexandre Ghiti <[email protected]> wrote:
>
> This is a preparatory patch for sv48 support that will introduce
> dynamic PAGE_OFFSET.
>
> Dynamic PAGE_OFFSET implies that all zones (vmalloc, vmemmap, fixaddr...)
> whose addresses depend on PAGE_OFFSET become dynamic and can't be used
> to statically initialize the array used by ptdump to identify the
> different zones of the vm layout.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/riscv/mm/ptdump.c | 49 ++++++++++++++++++++++++++++++++++--------
> 1 file changed, 40 insertions(+), 9 deletions(-)
>
> diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
> index 7eab76a93106..7d9386a7f5c2 100644
> --- a/arch/riscv/mm/ptdump.c
> +++ b/arch/riscv/mm/ptdump.c
> @@ -49,22 +49,41 @@ struct addr_marker {
> const char *name;
> };
>
> +enum address_markers_idx {
> +#ifdef CONFIG_KASAN
> + KASAN_SHADOW_START_NR,
> + KASAN_SHADOW_END_NR,
> +#endif
> + FIXMAP_START_NR,
> + FIXMAP_END_NR,
> + PCI_IO_START_NR,
> + PCI_IO_END_NR,
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> + VMEMMAP_START_NR,
> + VMEMMAP_END_NR,
> +#endif
> + VMALLOC_START_NR,
> + VMALLOC_END_NR,
> + PAGE_OFFSET_NR,
> + END_OF_SPACE_NR
> +};
> +
> static struct addr_marker address_markers[] = {
> #ifdef CONFIG_KASAN
> {KASAN_SHADOW_START, "Kasan shadow start"},
> {KASAN_SHADOW_END, "Kasan shadow end"},
> #endif
> - {FIXADDR_START, "Fixmap start"},
> - {FIXADDR_TOP, "Fixmap end"},
> - {PCI_IO_START, "PCI I/O start"},
> - {PCI_IO_END, "PCI I/O end"},
> + {0, "Fixmap start"},
> + {0, "Fixmap end"},
> + {0, "PCI I/O start"},
> + {0, "PCI I/O end"},
> #ifdef CONFIG_SPARSEMEM_VMEMMAP
> - {VMEMMAP_START, "vmemmap start"},
> - {VMEMMAP_END, "vmemmap end"},
> + {0, "vmemmap start"},
> + {0, "vmemmap end"},
> #endif
> - {VMALLOC_START, "vmalloc() area"},
> - {VMALLOC_END, "vmalloc() end"},
> - {PAGE_OFFSET, "Linear mapping"},
> + {0, "vmalloc() area"},
> + {0, "vmalloc() end"},
> + {0, "Linear mapping"},
> {-1, NULL},
> };
>
> @@ -304,6 +323,18 @@ static int ptdump_init(void)
> {
> unsigned int i, j;
>
> + address_markers[FIXMAP_START_NR].start_address = FIXADDR_START;
> + address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP;
> + address_markers[PCI_IO_START_NR].start_address = PCI_IO_START;
> + address_markers[PCI_IO_END_NR].start_address = PCI_IO_END;
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
> + address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END;
> +#endif
> + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
> + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
> + address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET;
> +
> for (i = 0; i < ARRAY_SIZE(pg_level); i++)
> for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
> pg_level[i].mask |= pte_bits[j].mask;
> --
> 2.20.1
>
Looks good to me.
Reviewed-by: Anup Patel <[email protected]>
Regards,
Anup
On Sun, May 24, 2020 at 2:47 PM Alexandre Ghiti <[email protected]> wrote:
>
> Now that the mmu type is determined at runtime by probing the SATP
> register, use the global variable pgtable_l4_enabled to output the
> mmu type of the processor through /proc/cpuinfo instead of relying on
> device tree information.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> Reviewed-by: Anup Patel <[email protected]>
> Reviewed-by: Palmer Dabbelt <[email protected]>
> ---
> arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 4 ----
> arch/riscv/kernel/cpu.c | 24 ++++++++++++----------
> 2 files changed, 13 insertions(+), 15 deletions(-)
>
> diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
> index 7db861053483..6138590a2229 100644
> --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
> +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
> @@ -50,7 +50,6 @@
> i-cache-size = <32768>;
> i-tlb-sets = <1>;
> i-tlb-size = <32>;
> - mmu-type = "riscv,sv39";
> reg = <1>;
> riscv,isa = "rv64imafdc";
> tlb-split;
> @@ -74,7 +73,6 @@
> i-cache-size = <32768>;
> i-tlb-sets = <1>;
> i-tlb-size = <32>;
> - mmu-type = "riscv,sv39";
> reg = <2>;
> riscv,isa = "rv64imafdc";
> tlb-split;
> @@ -98,7 +96,6 @@
> i-cache-size = <32768>;
> i-tlb-sets = <1>;
> i-tlb-size = <32>;
> - mmu-type = "riscv,sv39";
> reg = <3>;
> riscv,isa = "rv64imafdc";
> tlb-split;
> @@ -122,7 +119,6 @@
> i-cache-size = <32768>;
> i-tlb-sets = <1>;
> i-tlb-size = <32>;
> - mmu-type = "riscv,sv39";
> reg = <4>;
> riscv,isa = "rv64imafdc";
> tlb-split;
Your PATCH6 is already doing the right thing by skipping CPU DT
nodes that don't have the "mmu-type" DT property.
The "mmu-type" DT property is very critical for the RUNTIME M-mode
firmware (OpenSBI) because it tells whether a given CPU has an MMU
(or not). This is also in agreement with the current DT bindings
document for RISC-V CPUs.
I suggest dropping the change in sifive/fu540-c000.dtsi; the rest of
the patch is fine, so my Reviewed-by still holds.
Regards,
Anup
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> index 40a3c442ac5f..38a699b997a8 100644
> --- a/arch/riscv/kernel/cpu.c
> +++ b/arch/riscv/kernel/cpu.c
> @@ -8,6 +8,8 @@
> #include <linux/of.h>
> #include <asm/smp.h>
>
> +extern bool pgtable_l4_enabled;
> +
> /*
> * Returns the hart ID of the given device tree node, or -ENODEV if the node
> * isn't an enabled and valid RISC-V hart node.
> @@ -54,18 +56,19 @@ static void print_isa(struct seq_file *f, const char *isa)
> seq_puts(f, "\n");
> }
>
> -static void print_mmu(struct seq_file *f, const char *mmu_type)
> +static void print_mmu(struct seq_file *f)
> {
> + char sv_type[16];
> +
> #if defined(CONFIG_32BIT)
> - if (strcmp(mmu_type, "riscv,sv32") != 0)
> - return;
> + strncpy(sv_type, "sv32", 5);
> #elif defined(CONFIG_64BIT)
> - if (strcmp(mmu_type, "riscv,sv39") != 0 &&
> - strcmp(mmu_type, "riscv,sv48") != 0)
> - return;
> + if (pgtable_l4_enabled)
> + strncpy(sv_type, "sv48", 5);
> + else
> + strncpy(sv_type, "sv39", 5);
> #endif
> -
> - seq_printf(f, "mmu\t\t: %s\n", mmu_type+6);
> + seq_printf(f, "mmu\t\t: %s\n", sv_type);
> }
>
> static void *c_start(struct seq_file *m, loff_t *pos)
> @@ -90,14 +93,13 @@ static int c_show(struct seq_file *m, void *v)
> {
> unsigned long cpu_id = (unsigned long)v - 1;
> struct device_node *node = of_get_cpu_node(cpu_id, NULL);
> - const char *compat, *isa, *mmu;
> + const char *compat, *isa;
>
> seq_printf(m, "processor\t: %lu\n", cpu_id);
> seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id));
> if (!of_property_read_string(node, "riscv,isa", &isa))
> print_isa(m, isa);
> - if (!of_property_read_string(node, "mmu-type", &mmu))
> - print_mmu(m, mmu);
> + print_mmu(m);
> if (!of_property_read_string(node, "compatible", &compat)
> && strcmp(compat, "riscv"))
> seq_printf(m, "uarch\t\t: %s\n", compat);
> --
> 2.20.1
>
On Sun, May 24, 2020 at 2:45 PM Alexandre Ghiti <[email protected]> wrote:
>
> By adding a new 4th level of page table, give the possibility to 64bit
> kernel to address 2^48 bytes of virtual address: in practice, that roughly
> offers ~160TB of virtual address space to userspace and allows up to 64TB
> of physical memory.
>
> If the underlying hardware does not support sv48, we will automatically
> fall back to a standard 3-level page table by folding the new PUD level
> into the PGDIR level. In order to detect HW capabilities at runtime, we
> use the SATP feature that ignores writes with an unsupported mode.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/riscv/Kconfig | 6 +-
> arch/riscv/include/asm/csr.h | 3 +-
> arch/riscv/include/asm/fixmap.h | 1 +
> arch/riscv/include/asm/page.h | 15 +++
> arch/riscv/include/asm/pgalloc.h | 36 +++++++
> arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++++++++-
> arch/riscv/include/asm/pgtable.h | 9 +-
> arch/riscv/kernel/head.S | 3 +-
> arch/riscv/mm/context.c | 4 +-
> arch/riscv/mm/init.c | 159 +++++++++++++++++++++++++---
> 10 files changed, 309 insertions(+), 24 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e167f16131f4..3f73f60e9732 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -68,6 +68,7 @@ config RISCV
> select ARCH_HAS_GCOV_PROFILE_ALL
> select HAVE_COPY_THREAD_TLS
> select HAVE_ARCH_KASAN if MMU && 64BIT
> + select RELOCATABLE if 64BIT
>
> config ARCH_MMAP_RND_BITS_MIN
> default 18 if 64BIT
> @@ -106,7 +107,7 @@ config PAGE_OFFSET
> default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
> default 0x80000000 if 64BIT && !MMU
> default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
> - default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
> + default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB
>
> config ARCH_FLATMEM_ENABLE
> def_bool y
> @@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
> config FIX_EARLYCON_MEM
> def_bool MMU
>
> +# On a 64BIT relocatable kernel, the 4-level page table is at runtime folded
> +# on a 3-level page table when sv48 is not supported.
> config PGTABLE_LEVELS
> int
> + default 4 if 64BIT && RELOCATABLE
> default 3 if 64BIT
> default 2
>
> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> index cec462e198ce..d41536c3f8d4 100644
> --- a/arch/riscv/include/asm/csr.h
> +++ b/arch/riscv/include/asm/csr.h
> @@ -40,11 +40,10 @@
> #ifndef CONFIG_64BIT
> #define SATP_PPN _AC(0x003FFFFF, UL)
> #define SATP_MODE_32 _AC(0x80000000, UL)
> -#define SATP_MODE SATP_MODE_32
> #else
> #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
> #define SATP_MODE_39 _AC(0x8000000000000000, UL)
> -#define SATP_MODE SATP_MODE_39
> +#define SATP_MODE_48 _AC(0x9000000000000000, UL)
> #endif
>
> /* Exception cause high bit - is an interrupt if set */
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 2368d49eb4ef..d891cf9c73c5 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -27,6 +27,7 @@ enum fixed_addresses {
> FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
> FIX_PTE,
> FIX_PMD,
> + FIX_PUD,
> FIX_TEXT_POKE1,
> FIX_TEXT_POKE0,
> FIX_EARLYCON_MEM_BASE,
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 48bb09b6a9b7..5e77fe7f0d6d 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -31,7 +31,19 @@
> * When not using MMU this corresponds to the first free page in
> * physical memory (aligned on a page boundary).
> */
> +#ifdef CONFIG_RELOCATABLE
> +#define PAGE_OFFSET __page_offset
> +
> +#ifdef CONFIG_64BIT
> +/*
> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> + * define the PAGE_OFFSET value for SV39.
> + */
> +#define PAGE_OFFSET_L3 0xffffffe000000000
> +#endif /* CONFIG_64BIT */
> +#else
> #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
> +#endif /* CONFIG_RELOCATABLE */
>
> #define KERN_VIRT_SIZE (-PAGE_OFFSET)
>
> @@ -102,6 +114,9 @@ extern unsigned long pfn_base;
> extern unsigned long max_low_pfn;
> extern unsigned long min_low_pfn;
> extern unsigned long kernel_virt_addr;
> +#ifdef CONFIG_RELOCATABLE
> +extern unsigned long __page_offset;
> +#endif
>
> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
> #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> index 3f601ee8233f..540eaa5a8658 100644
> --- a/arch/riscv/include/asm/pgalloc.h
> +++ b/arch/riscv/include/asm/pgalloc.h
> @@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>
> set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> }
> +
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> +{
> + if (pgtable_l4_enabled) {
> + unsigned long pfn = virt_to_pfn(pud);
> +
> + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +}
> +
> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> + pud_t *pud)
> +{
> + if (pgtable_l4_enabled) {
> + unsigned long pfn = virt_to_pfn(pud);
> +
> + set_p4d_safe(p4d,
> + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +}
> +
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> + if (pgtable_l4_enabled)
> + return (pud_t *)__get_free_page(
> + GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
> + return NULL;
> +}
> +
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> + if (pgtable_l4_enabled)
> + free_page((unsigned long)pud);
> +}
> +
> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
> #endif /* __PAGETABLE_PMD_FOLDED */
>
> #define pmd_pgtable(pmd) pmd_page(pmd)
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index b15f70a1fdfa..c84c31fbf8da 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -8,16 +8,32 @@
>
> #include <linux/const.h>
>
> -#define PGDIR_SHIFT 30
> +extern bool pgtable_l4_enabled;
> +
> +#define PGDIR_SHIFT (pgtable_l4_enabled ? 39 : 30)
> /* Size of region mapped by a page global directory */
> #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
> #define PGDIR_MASK (~(PGDIR_SIZE - 1))
>
> +/* pud is folded into pgd in case of 3-level page table */
> +#define PUD_SHIFT 30
> +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
> +#define PUD_MASK (~(PUD_SIZE - 1))
> +
> #define PMD_SHIFT 21
> /* Size of region mapped by a page middle directory */
> #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
> #define PMD_MASK (~(PMD_SIZE - 1))
>
> +/* Page Upper Directory entry */
> +typedef struct {
> + unsigned long pud;
> +} pud_t;
> +
> +#define pud_val(x) ((x).pud)
> +#define __pud(x) ((pud_t) { (x) })
> +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
> +
> /* Page Middle Directory entry */
> typedef struct {
> unsigned long pmd;
> @@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
> set_pud(pudp, __pud(0));
> }
>
> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> +{
> + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> +}
> +
> +static inline unsigned long _pud_pfn(pud_t pud)
> +{
> + return pud_val(pud) >> _PAGE_PFN_SHIFT;
> +}
> +
> static inline unsigned long pud_page_vaddr(pud_t pud)
> {
> return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> @@ -70,6 +96,15 @@ static inline struct page *pud_page(pud_t pud)
> return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
> }
>
> +#define mm_pud_folded mm_pud_folded
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> + if (pgtable_l4_enabled)
> + return false;
> +
> + return true;
> +}
> +
> #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
>
> static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
> @@ -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
> #define pmd_ERROR(e) \
> pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>
> +#define pud_ERROR(e) \
> + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> +
> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + *p4dp = p4d;
> + else
> + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> +}
> +
> +static inline int p4d_none(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (p4d_val(p4d) == 0);
> +
> + return 0;
> +}
> +
> +static inline int p4d_present(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (p4d_val(p4d) & _PAGE_PRESENT);
> +
> + return 1;
> +}
> +
> +static inline int p4d_bad(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return !p4d_present(p4d);
> +
> + return 0;
> +}
> +
> +static inline void p4d_clear(p4d_t *p4d)
> +{
> + if (pgtable_l4_enabled)
> + set_p4d(p4d, __p4d(0));
> +}
> +
> +static inline unsigned long p4d_page_vaddr(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (unsigned long)pfn_to_virt(
> + p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +
> + return pud_page_vaddr((pud_t) { p4d_val(p4d) });
> +}
> +
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +
> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +{
> + if (pgtable_l4_enabled)
> + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
> +
> + return (pud_t *)p4d;
> +}
> +
> #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 8e96315b3366..b8a8ba69d0a2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -20,12 +20,14 @@
> * the kernel.
> */
> #define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1)
> -#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR
> +#define KERNEL_LINK_ADDR (VMALLOC_LINK_END - SZ_2G + 1)
>
> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
> #define VMALLOC_END (PAGE_OFFSET - 1)
> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
>
> +#define VMALLOC_LINK_END (_AC(CONFIG_PAGE_OFFSET, UL) - 1)
> +
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #define BPF_JIT_REGION_START (kernel_virt_addr)
> #define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)
> @@ -67,8 +69,7 @@
>
> #ifndef __ASSEMBLY__
>
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> +#include <asm-generic/pgtable-nop4d.h>
> #include <asm/page.h>
> #include <asm/tlbflush.h>
> #include <linux/mm_types.h>
> @@ -81,7 +82,7 @@
>
> #ifdef CONFIG_MMU
> #ifdef CONFIG_64BIT
> -#define VA_BITS 39
> +#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
> #define PA_BITS 56
> #else
> #define VA_BITS 32
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 8f5bb7731327..0632c4834c68 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -62,7 +62,8 @@ relocate:
>
> /* Compute satp for kernel page tables, but don't load it yet */
> srl a2, a0, PAGE_SHIFT
> - li a1, SATP_MODE
> + la a1, satp_mode
> + REG_L a1, 0(a1)
> or a2, a2, a1
>
> /*
> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> index 613ec81a8979..6830504f8b11 100644
> --- a/arch/riscv/mm/context.c
> +++ b/arch/riscv/mm/context.c
> @@ -9,6 +9,8 @@
> #include <asm/cacheflush.h>
> #include <asm/mmu_context.h>
>
> +extern u64 satp_mode;
> +
> /*
> * When necessary, performs a deferred icache flush for the given MM context,
> * on the local CPU. RISC-V has no direct mechanism for instruction cache
> @@ -59,7 +61,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> cpumask_set_cpu(cpu, mm_cpumask(next));
>
> #ifdef CONFIG_MMU
> - csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE);
> + csr_write(CSR_SATP, virt_to_pfn(next->pgd) | satp_mode);
> local_flush_tlb_all();
> #endif
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 5782cae58ac2..bad8da099ff6 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -25,8 +25,23 @@
>
> #include "../kernel/head.h"
>
> -unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
> +#ifdef CONFIG_64BIT
> +u64 satp_mode = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ?
> + SATP_MODE_39 : SATP_MODE_48;
> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ? false : true;
> +#else
> +u64 satp_mode = SATP_MODE_32;
> +bool pgtable_l4_enabled;
> +#endif
> +EXPORT_SYMBOL(pgtable_l4_enabled);
> +EXPORT_SYMBOL(satp_mode);
> +
> +unsigned long kernel_virt_addr;
> EXPORT_SYMBOL(kernel_virt_addr);
> +#ifdef CONFIG_RELOCATABLE
> +unsigned long __page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
> +EXPORT_SYMBOL(__page_offset);
> +#endif
>
> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
> __page_aligned_bss;
> @@ -254,9 +269,12 @@ static void __init create_pte_mapping(pte_t *ptep,
>
> #ifndef __PAGETABLE_PMD_FOLDED
>
> +pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
> +pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
> pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
> +pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
>
> static pmd_t *__init get_pmd_virt(phys_addr_t pa)
> {
> @@ -273,7 +291,8 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
> if (mmu_enabled)
> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>
> - BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> + /* Only one PMD is available for early mapping */
> + BUG_ON((va - kernel_virt_addr) >> PUD_SHIFT);
>
> return (uintptr_t)early_pmd;
> }
> @@ -305,19 +324,70 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> create_pte_mapping(ptep, va, pa, sz, prot);
> }
>
> -#define pgd_next_t pmd_t
> -#define alloc_pgd_next(__va) alloc_pmd(__va)
> -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
> +static pud_t *__init get_pud_virt(phys_addr_t pa)
> +{
> + if (mmu_enabled) {
> + clear_fixmap(FIX_PUD);
> + return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> + } else {
> + return (pud_t *)((uintptr_t)pa);
> + }
> +}
> +
> +static phys_addr_t __init alloc_pud(uintptr_t va)
> +{
> + if (mmu_enabled)
> + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> +
> + /* Only one PUD is available for early mapping */
> + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> +
> + return (uintptr_t)early_pud;
> +}
> +
> +static void __init create_pud_mapping(pud_t *pudp,
> + uintptr_t va, phys_addr_t pa,
> + phys_addr_t sz, pgprot_t prot)
> +{
> + pmd_t *nextp;
> + phys_addr_t next_phys;
> + uintptr_t pud_index = pud_index(va);
> +
> + if (sz == PUD_SIZE) {
> + if (pud_val(pudp[pud_index]) == 0)
> + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> + return;
> + }
> +
> + if (pud_val(pudp[pud_index]) == 0) {
> + next_phys = alloc_pmd(va);
> + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
> + nextp = get_pmd_virt(next_phys);
> + memset(nextp, 0, PAGE_SIZE);
> + } else {
> + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> + nextp = get_pmd_virt(next_phys);
> + }
> +
> + create_pmd_mapping(nextp, va, pa, sz, prot);
> +}
> +
> +#define pgd_next_t pud_t
> +#define alloc_pgd_next(__va) alloc_pud(__va)
> +#define get_pgd_next_virt(__pa) get_pud_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next fixmap_pmd
> + create_pud_mapping(__nextp, __va, __pa, __sz, __prot)
> +#define fixmap_pgd_next (pgtable_l4_enabled ? \
> + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> +#define trampoline_pgd_next (pgtable_l4_enabled ? \
> + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
> #else
> #define pgd_next_t pte_t
> #define alloc_pgd_next(__va) alloc_pte(__va)
> #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next fixmap_pte
> +#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
> #endif
>
> static void __init create_pgd_mapping(pgd_t *pgdp,
> @@ -328,6 +398,13 @@ static void __init create_pgd_mapping(pgd_t *pgdp,
> phys_addr_t next_phys;
> uintptr_t pgd_index = pgd_index(va);
>
> +#ifndef __PAGETABLE_PMD_FOLDED
> + if (!pgtable_l4_enabled) {
> + create_pud_mapping((pud_t *)pgdp, va, pa, sz, prot);
> + return;
> + }
> +#endif
> +
> if (sz == PGDIR_SIZE) {
> if (pgd_val(pgdp[pgd_index]) == 0)
> pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
> @@ -419,6 +496,47 @@ void __init relocate_kernel(uintptr_t load_pa)
> }
> }
>
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> +void disable_pgtable_l4(void)
> +{
> + pgtable_l4_enabled = false;
> + __page_offset = PAGE_OFFSET_L3;
> + satp_mode = SATP_MODE_39;
> +}
> +
> +/* There is a simple way to determine if 4-level is supported by the
> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> + * then read SATP to see if the configuration was taken into account
> + * meaning sv48 is supported.
> + */
> +asmlinkage __init void set_satp_mode(uintptr_t load_pa)
> +{
> + u64 identity_satp, hw_satp;
> + int cpus_node;
> +
> + create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
> + PGDIR_SIZE, PAGE_TABLE);
> + create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
> + PUD_SIZE, PAGE_TABLE);
> + create_pmd_mapping(early_pmd, load_pa, load_pa,
> + PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> + local_flush_tlb_all();
> + csr_write(CSR_SATP, identity_satp);
> +
> + hw_satp = csr_read(CSR_SATP);
> + csr_write(CSR_SATP, 0ULL);
> + local_flush_tlb_all();
> +
> + if (hw_satp != identity_satp)
> + disable_pgtable_l4();
> +
> + memset(early_pg_dir, 0, PAGE_SIZE);
> + memset(early_pud, 0, PAGE_SIZE);
> + memset(early_pmd, 0, PAGE_SIZE);
> +}
> +#endif
> #endif
>
> static uintptr_t load_pa, load_sz;
> @@ -442,9 +560,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> load_pa = (uintptr_t)(&_start);
> load_sz = (uintptr_t)(&_end) - load_pa;
>
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> + set_satp_mode(load_pa);
> +#endif
> +
> + kernel_virt_addr = KERNEL_VIRT_ADDR;
> +
> va_pa_offset = PAGE_OFFSET - load_pa;
> va_kernel_pa_offset = kernel_virt_addr - load_pa;
> -
> pfn_base = PFN_DOWN(load_pa);
>
> #ifdef CONFIG_RELOCATABLE
> @@ -473,15 +596,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>
> /* Setup early PGD for fixmap */
> create_pgd_mapping(early_pg_dir, FIXADDR_START,
> - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>
> #ifndef __PAGETABLE_PMD_FOLDED
> - /* Setup fixmap PMD */
> + /* Setup fixmap PUD and PMD */
> + if (pgtable_l4_enabled)
> + create_pud_mapping(fixmap_pud, FIXADDR_START,
> + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
> create_pmd_mapping(fixmap_pmd, FIXADDR_START,
> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
> +
> /* Setup trampoline PGD and PMD */
> create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
> - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> + if (pgtable_l4_enabled)
> + create_pud_mapping(trampoline_pud, kernel_virt_addr,
> + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
> create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
> #else
> @@ -558,12 +688,13 @@ static void __init setup_vm_final(void)
>
> vm_area_add_early(&vm_kernel);
>
> - /* Clear fixmap PTE and PMD mappings */
> + /* Clear fixmap page table mappings */
> clear_fixmap(FIX_PTE);
> clear_fixmap(FIX_PMD);
> + clear_fixmap(FIX_PUD);
>
> /* Move to swapper page table */
> - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
> local_flush_tlb_all();
> }
>
> --
> 2.20.1
>
Looks good to me.
Reviewed-by: Anup Patel <[email protected]>
Regards,
Anup
On Sun, May 24, 2020 at 2:46 PM Alexandre Ghiti <[email protected]> wrote:
>
> This is made possible by using the mmu-type property of the cpu node of
> the device tree.
>
> By default, the kernel will boot with a 4-level page table if the hw
> supports it, but it can be useful for the user to select a 3-level page
> table, as it consumes less memory and is faster since it requires fewer
> memory accesses in case of a TLB miss.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/riscv/mm/init.c | 25 +++++++++++++++++++++++--
> 1 file changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index bad8da099ff6..1776eeb53d61 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -509,11 +509,32 @@ void disable_pgtable_l4(void)
> * then read SATP to see if the configuration was taken into account
> * meaning sv48 is supported.
> */
> -asmlinkage __init void set_satp_mode(uintptr_t load_pa)
> +asmlinkage __init void set_satp_mode(uintptr_t load_pa, uintptr_t dtb_pa)
> {
> u64 identity_satp, hw_satp;
> int cpus_node;
>
> + /* 1/ Check if the user asked for sv39 explicitly in the device tree */
> + cpus_node = fdt_path_offset((void *)dtb_pa, "/cpus");
> + if (cpus_node >= 0) {
> + int node;
> +
> + fdt_for_each_subnode(node, (void *)dtb_pa, cpus_node) {
> + const char *mmu_type = fdt_getprop((void *)dtb_pa, node,
> + "mmu-type", NULL);
> + if (!mmu_type)
> + continue;
> +
> + if (!strcmp(mmu_type, "riscv,sv39")) {
> + disable_pgtable_l4();
> + return;
> + }
> +
> + break;
> + }
> + }
> +
> + /* 2/ Determine if the HW supports sv48: if not, fallback to sv39 */
> create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
> PGDIR_SIZE, PAGE_TABLE);
> create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
> @@ -561,7 +582,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> load_sz = (uintptr_t)(&_end) - load_pa;
>
> #if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> - set_satp_mode(load_pa);
> + set_satp_mode(load_pa, dtb_pa);
> #endif
>
> kernel_virt_addr = KERNEL_VIRT_ADDR;
> --
> 2.20.1
>
Looks good to me.
Reviewed-by: Anup Patel <[email protected]>
Regards,
Anup
On Sun, May 24, 2020 at 2:45 PM Alexandre Ghiti <[email protected]> wrote:
>
> Adding a 4th level of page table allows the 64bit kernel to address
> 2^48 bytes of virtual address space: in practice, that offers ~160TB
> of virtual address space to userspace and allows up to 64TB of
> physical memory.
>
> If the underlying hardware does not support sv48, we will automatically
> fall back to a standard 3-level page table by folding the new PUD level into
> the PGDIR level. In order to detect HW capabilities at runtime, we rely on
> the fact that SATP ignores writes with an unsupported mode.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/riscv/Kconfig | 6 +-
> arch/riscv/include/asm/csr.h | 3 +-
> arch/riscv/include/asm/fixmap.h | 1 +
> arch/riscv/include/asm/page.h | 15 +++
> arch/riscv/include/asm/pgalloc.h | 36 +++++++
> arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++++++++-
> arch/riscv/include/asm/pgtable.h | 9 +-
> arch/riscv/kernel/head.S | 3 +-
> arch/riscv/mm/context.c | 4 +-
> arch/riscv/mm/init.c | 159 +++++++++++++++++++++++++---
> 10 files changed, 309 insertions(+), 24 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index e167f16131f4..3f73f60e9732 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -68,6 +68,7 @@ config RISCV
> select ARCH_HAS_GCOV_PROFILE_ALL
> select HAVE_COPY_THREAD_TLS
> select HAVE_ARCH_KASAN if MMU && 64BIT
> + select RELOCATABLE if 64BIT
>
> config ARCH_MMAP_RND_BITS_MIN
> default 18 if 64BIT
> @@ -106,7 +107,7 @@ config PAGE_OFFSET
> default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
> default 0x80000000 if 64BIT && !MMU
> default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
> - default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
> + default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB
>
> config ARCH_FLATMEM_ENABLE
> def_bool y
> @@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
> config FIX_EARLYCON_MEM
> def_bool MMU
>
> +# On a 64BIT relocatable kernel, the 4-level page table is at runtime folded
> +# on a 3-level page table when sv48 is not supported.
> config PGTABLE_LEVELS
> int
> + default 4 if 64BIT && RELOCATABLE
> default 3 if 64BIT
> default 2
>
> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
> index cec462e198ce..d41536c3f8d4 100644
> --- a/arch/riscv/include/asm/csr.h
> +++ b/arch/riscv/include/asm/csr.h
> @@ -40,11 +40,10 @@
> #ifndef CONFIG_64BIT
> #define SATP_PPN _AC(0x003FFFFF, UL)
> #define SATP_MODE_32 _AC(0x80000000, UL)
> -#define SATP_MODE SATP_MODE_32
> #else
> #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
> #define SATP_MODE_39 _AC(0x8000000000000000, UL)
> -#define SATP_MODE SATP_MODE_39
> +#define SATP_MODE_48 _AC(0x9000000000000000, UL)
> #endif
>
> /* Exception cause high bit - is an interrupt if set */
> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> index 2368d49eb4ef..d891cf9c73c5 100644
> --- a/arch/riscv/include/asm/fixmap.h
> +++ b/arch/riscv/include/asm/fixmap.h
> @@ -27,6 +27,7 @@ enum fixed_addresses {
> FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
> FIX_PTE,
> FIX_PMD,
> + FIX_PUD,
> FIX_TEXT_POKE1,
> FIX_TEXT_POKE0,
> FIX_EARLYCON_MEM_BASE,
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 48bb09b6a9b7..5e77fe7f0d6d 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -31,7 +31,19 @@
> * When not using MMU this corresponds to the first free page in
> * physical memory (aligned on a page boundary).
> */
> +#ifdef CONFIG_RELOCATABLE
> +#define PAGE_OFFSET __page_offset
> +
> +#ifdef CONFIG_64BIT
> +/*
> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
> + * define the PAGE_OFFSET value for SV39.
> + */
> +#define PAGE_OFFSET_L3 0xffffffe000000000
> +#endif /* CONFIG_64BIT */
> +#else
> #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
> +#endif /* CONFIG_RELOCATABLE */
>
> #define KERN_VIRT_SIZE (-PAGE_OFFSET)
>
> @@ -102,6 +114,9 @@ extern unsigned long pfn_base;
> extern unsigned long max_low_pfn;
> extern unsigned long min_low_pfn;
> extern unsigned long kernel_virt_addr;
> +#ifdef CONFIG_RELOCATABLE
> +extern unsigned long __page_offset;
> +#endif
>
> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
> #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
> index 3f601ee8233f..540eaa5a8658 100644
> --- a/arch/riscv/include/asm/pgalloc.h
> +++ b/arch/riscv/include/asm/pgalloc.h
> @@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>
> set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> }
> +
> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
> +{
> + if (pgtable_l4_enabled) {
> + unsigned long pfn = virt_to_pfn(pud);
> +
> + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +}
> +
> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> + pud_t *pud)
> +{
> + if (pgtable_l4_enabled) {
> + unsigned long pfn = virt_to_pfn(pud);
> +
> + set_p4d_safe(p4d,
> + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> + }
> +}
> +
> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
> +{
> + if (pgtable_l4_enabled)
> + return (pud_t *)__get_free_page(
> + GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
> + return NULL;
> +}
> +
> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
> +{
> + if (pgtable_l4_enabled)
> + free_page((unsigned long)pud);
> +}
> +
> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
> #endif /* __PAGETABLE_PMD_FOLDED */
>
> #define pmd_pgtable(pmd) pmd_page(pmd)
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index b15f70a1fdfa..c84c31fbf8da 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -8,16 +8,32 @@
>
> #include <linux/const.h>
>
> -#define PGDIR_SHIFT 30
> +extern bool pgtable_l4_enabled;
> +
> +#define PGDIR_SHIFT (pgtable_l4_enabled ? 39 : 30)
> /* Size of region mapped by a page global directory */
> #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
> #define PGDIR_MASK (~(PGDIR_SIZE - 1))
>
> +/* pud is folded into pgd in case of 3-level page table */
> +#define PUD_SHIFT 30
> +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
> +#define PUD_MASK (~(PUD_SIZE - 1))
> +
> #define PMD_SHIFT 21
> /* Size of region mapped by a page middle directory */
> #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
> #define PMD_MASK (~(PMD_SIZE - 1))
>
> +/* Page Upper Directory entry */
> +typedef struct {
> + unsigned long pud;
> +} pud_t;
> +
> +#define pud_val(x) ((x).pud)
> +#define __pud(x) ((pud_t) { (x) })
> +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
> +
> /* Page Middle Directory entry */
> typedef struct {
> unsigned long pmd;
> @@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
> set_pud(pudp, __pud(0));
> }
>
> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
> +{
> + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
> +}
> +
> +static inline unsigned long _pud_pfn(pud_t pud)
> +{
> + return pud_val(pud) >> _PAGE_PFN_SHIFT;
> +}
> +
> static inline unsigned long pud_page_vaddr(pud_t pud)
> {
> return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
> @@ -70,6 +96,15 @@ static inline struct page *pud_page(pud_t pud)
> return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
> }
>
> +#define mm_pud_folded mm_pud_folded
> +static inline bool mm_pud_folded(struct mm_struct *mm)
> +{
> + if (pgtable_l4_enabled)
> + return false;
> +
> + return true;
> +}
> +
> #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
>
> static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
> @@ -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
> #define pmd_ERROR(e) \
> pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>
> +#define pud_ERROR(e) \
> + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
> +
> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + *p4dp = p4d;
> + else
> + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
> +}
> +
> +static inline int p4d_none(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (p4d_val(p4d) == 0);
> +
> + return 0;
> +}
> +
> +static inline int p4d_present(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (p4d_val(p4d) & _PAGE_PRESENT);
> +
> + return 1;
> +}
> +
> +static inline int p4d_bad(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return !p4d_present(p4d);
> +
> + return 0;
> +}
> +
> +static inline void p4d_clear(p4d_t *p4d)
> +{
> + if (pgtable_l4_enabled)
> + set_p4d(p4d, __p4d(0));
> +}
> +
> +static inline unsigned long p4d_page_vaddr(p4d_t p4d)
> +{
> + if (pgtable_l4_enabled)
> + return (unsigned long)pfn_to_virt(
> + p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> +
> + return pud_page_vaddr((pud_t) { p4d_val(p4d) });
> +}
> +
> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> +
> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +{
> + if (pgtable_l4_enabled)
> + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
> +
> + return (pud_t *)p4d;
> +}
> +
> #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 8e96315b3366..b8a8ba69d0a2 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -20,12 +20,14 @@
> * the kernel.
> */
> #define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1)
> -#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR
> +#define KERNEL_LINK_ADDR (VMALLOC_LINK_END - SZ_2G + 1)
>
> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
> #define VMALLOC_END (PAGE_OFFSET - 1)
> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
>
> +#define VMALLOC_LINK_END (_AC(CONFIG_PAGE_OFFSET, UL) - 1)
> +
> #define BPF_JIT_REGION_SIZE (SZ_128M)
> #define BPF_JIT_REGION_START (kernel_virt_addr)
> #define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)
> @@ -67,8 +69,7 @@
>
> #ifndef __ASSEMBLY__
>
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> +#include <asm-generic/pgtable-nop4d.h>
> #include <asm/page.h>
> #include <asm/tlbflush.h>
> #include <linux/mm_types.h>
> @@ -81,7 +82,7 @@
>
> #ifdef CONFIG_MMU
> #ifdef CONFIG_64BIT
> -#define VA_BITS 39
> +#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
> #define PA_BITS 56
> #else
> #define VA_BITS 32
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 8f5bb7731327..0632c4834c68 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -62,7 +62,8 @@ relocate:
>
> /* Compute satp for kernel page tables, but don't load it yet */
> srl a2, a0, PAGE_SHIFT
> - li a1, SATP_MODE
> + la a1, satp_mode
> + REG_L a1, 0(a1)
> or a2, a2, a1
>
> /*
> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
> index 613ec81a8979..6830504f8b11 100644
> --- a/arch/riscv/mm/context.c
> +++ b/arch/riscv/mm/context.c
> @@ -9,6 +9,8 @@
> #include <asm/cacheflush.h>
> #include <asm/mmu_context.h>
>
> +extern u64 satp_mode;
Please move this to asm/pgtable.h next to "extern void *dtb_early_va".
The same can be done for "pgtable_l4_enabled" to help PATCH7.
I forgot to mention this in my previous emails.
Regards,
Anup
> +
> /*
> * When necessary, performs a deferred icache flush for the given MM context,
> * on the local CPU. RISC-V has no direct mechanism for instruction cache
> @@ -59,7 +61,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> cpumask_set_cpu(cpu, mm_cpumask(next));
>
> #ifdef CONFIG_MMU
> - csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE);
> + csr_write(CSR_SATP, virt_to_pfn(next->pgd) | satp_mode);
> local_flush_tlb_all();
> #endif
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 5782cae58ac2..bad8da099ff6 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -25,8 +25,23 @@
>
> #include "../kernel/head.h"
>
> -unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
> +#ifdef CONFIG_64BIT
> +u64 satp_mode = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ?
> + SATP_MODE_39 : SATP_MODE_48;
> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ? false : true;
> +#else
> +u64 satp_mode = SATP_MODE_32;
> +bool pgtable_l4_enabled;
> +#endif
> +EXPORT_SYMBOL(pgtable_l4_enabled);
> +EXPORT_SYMBOL(satp_mode);
> +
> +unsigned long kernel_virt_addr;
> EXPORT_SYMBOL(kernel_virt_addr);
> +#ifdef CONFIG_RELOCATABLE
> +unsigned long __page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
> +EXPORT_SYMBOL(__page_offset);
> +#endif
>
> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
> __page_aligned_bss;
> @@ -254,9 +269,12 @@ static void __init create_pte_mapping(pte_t *ptep,
>
> #ifndef __PAGETABLE_PMD_FOLDED
>
> +pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
> +pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
> pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
> +pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
>
> static pmd_t *__init get_pmd_virt(phys_addr_t pa)
> {
> @@ -273,7 +291,8 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
> if (mmu_enabled)
> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>
> - BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> + /* Only one PMD is available for early mapping */
> + BUG_ON((va - kernel_virt_addr) >> PUD_SHIFT);
>
> return (uintptr_t)early_pmd;
> }
> @@ -305,19 +324,70 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
> create_pte_mapping(ptep, va, pa, sz, prot);
> }
>
> -#define pgd_next_t pmd_t
> -#define alloc_pgd_next(__va) alloc_pmd(__va)
> -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
> +static pud_t *__init get_pud_virt(phys_addr_t pa)
> +{
> + if (mmu_enabled) {
> + clear_fixmap(FIX_PUD);
> + return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> + } else {
> + return (pud_t *)((uintptr_t)pa);
> + }
> +}
> +
> +static phys_addr_t __init alloc_pud(uintptr_t va)
> +{
> + if (mmu_enabled)
> + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> +
> + /* Only one PUD is available for early mapping */
> + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> +
> + return (uintptr_t)early_pud;
> +}
> +
> +static void __init create_pud_mapping(pud_t *pudp,
> + uintptr_t va, phys_addr_t pa,
> + phys_addr_t sz, pgprot_t prot)
> +{
> + pmd_t *nextp;
> + phys_addr_t next_phys;
> + uintptr_t pud_index = pud_index(va);
> +
> + if (sz == PUD_SIZE) {
> + if (pud_val(pudp[pud_index]) == 0)
> + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> + return;
> + }
> +
> + if (pud_val(pudp[pud_index]) == 0) {
> + next_phys = alloc_pmd(va);
> + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
> + nextp = get_pmd_virt(next_phys);
> + memset(nextp, 0, PAGE_SIZE);
> + } else {
> + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> + nextp = get_pmd_virt(next_phys);
> + }
> +
> + create_pmd_mapping(nextp, va, pa, sz, prot);
> +}
> +
> +#define pgd_next_t pud_t
> +#define alloc_pgd_next(__va) alloc_pud(__va)
> +#define get_pgd_next_virt(__pa) get_pud_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next fixmap_pmd
> + create_pud_mapping(__nextp, __va, __pa, __sz, __prot)
> +#define fixmap_pgd_next (pgtable_l4_enabled ? \
> + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> +#define trampoline_pgd_next (pgtable_l4_enabled ? \
> + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
> #else
> #define pgd_next_t pte_t
> #define alloc_pgd_next(__va) alloc_pte(__va)
> #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> -#define fixmap_pgd_next fixmap_pte
> +#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
> #endif
>
> static void __init create_pgd_mapping(pgd_t *pgdp,
> @@ -328,6 +398,13 @@ static void __init create_pgd_mapping(pgd_t *pgdp,
> phys_addr_t next_phys;
> uintptr_t pgd_index = pgd_index(va);
>
> +#ifndef __PAGETABLE_PMD_FOLDED
> + if (!pgtable_l4_enabled) {
> + create_pud_mapping((pud_t *)pgdp, va, pa, sz, prot);
> + return;
> + }
> +#endif
> +
> if (sz == PGDIR_SIZE) {
> if (pgd_val(pgdp[pgd_index]) == 0)
> pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
> @@ -419,6 +496,47 @@ void __init relocate_kernel(uintptr_t load_pa)
> }
> }
>
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> +void disable_pgtable_l4(void)
> +{
> + pgtable_l4_enabled = false;
> + __page_offset = PAGE_OFFSET_L3;
> + satp_mode = SATP_MODE_39;
> +}
> +
> +/* There is a simple way to determine if 4-level is supported by the
> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> + * then read SATP to see if the configuration was taken into account
> + * meaning sv48 is supported.
> + */
> +asmlinkage __init void set_satp_mode(uintptr_t load_pa)
> +{
> + u64 identity_satp, hw_satp;
> + int cpus_node;
> +
> + create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
> + PGDIR_SIZE, PAGE_TABLE);
> + create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
> + PUD_SIZE, PAGE_TABLE);
> + create_pmd_mapping(early_pmd, load_pa, load_pa,
> + PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> + local_flush_tlb_all();
> + csr_write(CSR_SATP, identity_satp);
> +
> + hw_satp = csr_read(CSR_SATP);
> + csr_write(CSR_SATP, 0ULL);
> + local_flush_tlb_all();
> +
> + if (hw_satp != identity_satp)
> + disable_pgtable_l4();
> +
> + memset(early_pg_dir, 0, PAGE_SIZE);
> + memset(early_pud, 0, PAGE_SIZE);
> + memset(early_pmd, 0, PAGE_SIZE);
> +}
> +#endif
> #endif
>
> static uintptr_t load_pa, load_sz;
> @@ -442,9 +560,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> load_pa = (uintptr_t)(&_start);
> load_sz = (uintptr_t)(&_end) - load_pa;
>
> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> + set_satp_mode(load_pa);
> +#endif
> +
> + kernel_virt_addr = KERNEL_VIRT_ADDR;
> +
> va_pa_offset = PAGE_OFFSET - load_pa;
> va_kernel_pa_offset = kernel_virt_addr - load_pa;
> -
> pfn_base = PFN_DOWN(load_pa);
>
> #ifdef CONFIG_RELOCATABLE
> @@ -473,15 +596,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>
> /* Setup early PGD for fixmap */
> create_pgd_mapping(early_pg_dir, FIXADDR_START,
> - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>
> #ifndef __PAGETABLE_PMD_FOLDED
> - /* Setup fixmap PMD */
> + /* Setup fixmap PUD and PMD */
> + if (pgtable_l4_enabled)
> + create_pud_mapping(fixmap_pud, FIXADDR_START,
> + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
> create_pmd_mapping(fixmap_pmd, FIXADDR_START,
> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
> +
> /* Setup trampoline PGD and PMD */
> create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
> - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> + if (pgtable_l4_enabled)
> + create_pud_mapping(trampoline_pud, kernel_virt_addr,
> + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
> create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
> #else
> @@ -558,12 +688,13 @@ static void __init setup_vm_final(void)
>
> vm_area_add_early(&vm_kernel);
>
> - /* Clear fixmap PTE and PMD mappings */
> + /* Clear fixmap page table mappings */
> clear_fixmap(FIX_PTE);
> clear_fixmap(FIX_PMD);
> + clear_fixmap(FIX_PUD);
>
> /* Move to swapper page table */
> - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
> local_flush_tlb_all();
> }
>
> --
> 2.20.1
>
Hi Anup,
Le 5/25/20 à 2:21 AM, Anup Patel a écrit :
> On Sun, May 24, 2020 at 2:47 PM Alexandre Ghiti <[email protected]> wrote:
>> Now that the mmu type is determined at runtime using the SATP
>> characteristic, use the global variable pgtable_l4_enabled to output
>> the mmu type of the processor through /proc/cpuinfo instead of relying
>> on device tree information.
>>
>> Signed-off-by: Alexandre Ghiti <[email protected]>
>> Reviewed-by: Anup Patel <[email protected]>
>> Reviewed-by: Palmer Dabbelt <[email protected]>
>> ---
>> arch/riscv/boot/dts/sifive/fu540-c000.dtsi | 4 ----
>> arch/riscv/kernel/cpu.c | 24 ++++++++++++----------
>> 2 files changed, 13 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
>> index 7db861053483..6138590a2229 100644
>> --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
>> +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
>> @@ -50,7 +50,6 @@
>> i-cache-size = <32768>;
>> i-tlb-sets = <1>;
>> i-tlb-size = <32>;
>> - mmu-type = "riscv,sv39";
>> reg = <1>;
>> riscv,isa = "rv64imafdc";
>> tlb-split;
>> @@ -74,7 +73,6 @@
>> i-cache-size = <32768>;
>> i-tlb-sets = <1>;
>> i-tlb-size = <32>;
>> - mmu-type = "riscv,sv39";
>> reg = <2>;
>> riscv,isa = "rv64imafdc";
>> tlb-split;
>> @@ -98,7 +96,6 @@
>> i-cache-size = <32768>;
>> i-tlb-sets = <1>;
>> i-tlb-size = <32>;
>> - mmu-type = "riscv,sv39";
>> reg = <3>;
>> riscv,isa = "rv64imafdc";
>> tlb-split;
>> @@ -122,7 +119,6 @@
>> i-cache-size = <32768>;
>> i-tlb-sets = <1>;
>> i-tlb-size = <32>;
>> - mmu-type = "riscv,sv39";
>> reg = <4>;
>> riscv,isa = "rv64imafdc";
>> tlb-split;
> Your PATCH6 is already doing the right thing by skipping CPU DT
> nodes that don't have the "mmu-type" DT property.
>
> The "mmu-type" DT property is critical for the RUNTIME M-mode
> firmware (OpenSBI) because it tells whether or not a given CPU has
> an MMU. This is also in agreement with the current DT bindings
> document for RISC-V CPUs.
>
> I suggest dropping the change in sifive/fu540-c000.dtsi; the rest of
> the patch is fine, so my Reviewed-by still holds.
Ok I'll do that in v2, thanks.
Alex
> Regards,
> Anup
>
>> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
>> index 40a3c442ac5f..38a699b997a8 100644
>> --- a/arch/riscv/kernel/cpu.c
>> +++ b/arch/riscv/kernel/cpu.c
>> @@ -8,6 +8,8 @@
>> #include <linux/of.h>
>> #include <asm/smp.h>
>>
>> +extern bool pgtable_l4_enabled;
>> +
>> /*
>> * Returns the hart ID of the given device tree node, or -ENODEV if the node
>> * isn't an enabled and valid RISC-V hart node.
>> @@ -54,18 +56,19 @@ static void print_isa(struct seq_file *f, const char *isa)
>> seq_puts(f, "\n");
>> }
>>
>> -static void print_mmu(struct seq_file *f, const char *mmu_type)
>> +static void print_mmu(struct seq_file *f)
>> {
>> + char sv_type[16];
>> +
>> #if defined(CONFIG_32BIT)
>> - if (strcmp(mmu_type, "riscv,sv32") != 0)
>> - return;
>> + strncpy(sv_type, "sv32", 5);
>> #elif defined(CONFIG_64BIT)
>> - if (strcmp(mmu_type, "riscv,sv39") != 0 &&
>> - strcmp(mmu_type, "riscv,sv48") != 0)
>> - return;
>> + if (pgtable_l4_enabled)
>> + strncpy(sv_type, "sv48", 5);
>> + else
>> + strncpy(sv_type, "sv39", 5);
>> #endif
>> -
>> - seq_printf(f, "mmu\t\t: %s\n", mmu_type+6);
>> + seq_printf(f, "mmu\t\t: %s\n", sv_type);
>> }
>>
>> static void *c_start(struct seq_file *m, loff_t *pos)
>> @@ -90,14 +93,13 @@ static int c_show(struct seq_file *m, void *v)
>> {
>> unsigned long cpu_id = (unsigned long)v - 1;
>> struct device_node *node = of_get_cpu_node(cpu_id, NULL);
>> - const char *compat, *isa, *mmu;
>> + const char *compat, *isa;
>>
>> seq_printf(m, "processor\t: %lu\n", cpu_id);
>> seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id));
>> if (!of_property_read_string(node, "riscv,isa", &isa))
>> print_isa(m, isa);
>> - if (!of_property_read_string(node, "mmu-type", &mmu))
>> - print_mmu(m, mmu);
>> + print_mmu(m);
>> if (!of_property_read_string(node, "compatible", &compat)
>> && strcmp(compat, "riscv"))
>> seq_printf(m, "uarch\t\t: %s\n", compat);
>> --
>> 2.20.1
>>
Le 5/25/20 à 2:45 AM, Anup Patel a écrit :
> On Sun, May 24, 2020 at 2:45 PM Alexandre Ghiti <[email protected]> wrote:
>> Adding a 4th level of page table allows the 64bit kernel to address
>> 2^48 bytes of virtual address space: in practice, that offers ~160TB
>> of virtual address space to userspace and allows up to 64TB of
>> physical memory.
>>
>> If the underlying hardware does not support sv48, we will automatically
>> fall back to a standard 3-level page table by folding the new PUD level into
>> the PGDIR level. In order to detect HW capabilities at runtime, we rely on
>> the fact that SATP ignores writes with an unsupported mode.
>>
>> Signed-off-by: Alexandre Ghiti <[email protected]>
>> ---
>> arch/riscv/Kconfig | 6 +-
>> arch/riscv/include/asm/csr.h | 3 +-
>> arch/riscv/include/asm/fixmap.h | 1 +
>> arch/riscv/include/asm/page.h | 15 +++
>> arch/riscv/include/asm/pgalloc.h | 36 +++++++
>> arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++++++++-
>> arch/riscv/include/asm/pgtable.h | 9 +-
>> arch/riscv/kernel/head.S | 3 +-
>> arch/riscv/mm/context.c | 4 +-
>> arch/riscv/mm/init.c | 159 +++++++++++++++++++++++++---
>> 10 files changed, 309 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index e167f16131f4..3f73f60e9732 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -68,6 +68,7 @@ config RISCV
>> select ARCH_HAS_GCOV_PROFILE_ALL
>> select HAVE_COPY_THREAD_TLS
>> select HAVE_ARCH_KASAN if MMU && 64BIT
>> + select RELOCATABLE if 64BIT
>>
>> config ARCH_MMAP_RND_BITS_MIN
>> default 18 if 64BIT
>> @@ -106,7 +107,7 @@ config PAGE_OFFSET
>> default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
>> default 0x80000000 if 64BIT && !MMU
>> default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
>> - default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
>> + default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB
>>
>> config ARCH_FLATMEM_ENABLE
>> def_bool y
>> @@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
>> config FIX_EARLYCON_MEM
>> def_bool MMU
>>
>> +# On a 64BIT relocatable kernel, the 4-level page table is at runtime folded
>> +# on a 3-level page table when sv48 is not supported.
>> config PGTABLE_LEVELS
>> int
>> + default 4 if 64BIT && RELOCATABLE
>> default 3 if 64BIT
>> default 2
>>
>> diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
>> index cec462e198ce..d41536c3f8d4 100644
>> --- a/arch/riscv/include/asm/csr.h
>> +++ b/arch/riscv/include/asm/csr.h
>> @@ -40,11 +40,10 @@
>> #ifndef CONFIG_64BIT
>> #define SATP_PPN _AC(0x003FFFFF, UL)
>> #define SATP_MODE_32 _AC(0x80000000, UL)
>> -#define SATP_MODE SATP_MODE_32
>> #else
>> #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
>> #define SATP_MODE_39 _AC(0x8000000000000000, UL)
>> -#define SATP_MODE SATP_MODE_39
>> +#define SATP_MODE_48 _AC(0x9000000000000000, UL)
>> #endif
>>
>> /* Exception cause high bit - is an interrupt if set */
>> diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
>> index 2368d49eb4ef..d891cf9c73c5 100644
>> --- a/arch/riscv/include/asm/fixmap.h
>> +++ b/arch/riscv/include/asm/fixmap.h
>> @@ -27,6 +27,7 @@ enum fixed_addresses {
>> FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
>> FIX_PTE,
>> FIX_PMD,
>> + FIX_PUD,
>> FIX_TEXT_POKE1,
>> FIX_TEXT_POKE0,
>> FIX_EARLYCON_MEM_BASE,
>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>> index 48bb09b6a9b7..5e77fe7f0d6d 100644
>> --- a/arch/riscv/include/asm/page.h
>> +++ b/arch/riscv/include/asm/page.h
>> @@ -31,7 +31,19 @@
>> * When not using MMU this corresponds to the first free page in
>> * physical memory (aligned on a page boundary).
>> */
>> +#ifdef CONFIG_RELOCATABLE
>> +#define PAGE_OFFSET __page_offset
>> +
>> +#ifdef CONFIG_64BIT
>> +/*
>> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
>> + * define the PAGE_OFFSET value for SV39.
>> + */
>> +#define PAGE_OFFSET_L3 0xffffffe000000000
>> +#endif /* CONFIG_64BIT */
>> +#else
>> #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
>> +#endif /* CONFIG_RELOCATABLE */
>>
>> #define KERN_VIRT_SIZE (-PAGE_OFFSET)
>>
>> @@ -102,6 +114,9 @@ extern unsigned long pfn_base;
>> extern unsigned long max_low_pfn;
>> extern unsigned long min_low_pfn;
>> extern unsigned long kernel_virt_addr;
>> +#ifdef CONFIG_RELOCATABLE
>> +extern unsigned long __page_offset;
>> +#endif
>>
>> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset))
>> #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
>> diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
>> index 3f601ee8233f..540eaa5a8658 100644
>> --- a/arch/riscv/include/asm/pgalloc.h
>> +++ b/arch/riscv/include/asm/pgalloc.h
>> @@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>>
>> set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>> }
>> +
>> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
>> +{
>> + if (pgtable_l4_enabled) {
>> + unsigned long pfn = virt_to_pfn(pud);
>> +
>> + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>> + }
>> +}
>> +
>> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
>> + pud_t *pud)
>> +{
>> + if (pgtable_l4_enabled) {
>> + unsigned long pfn = virt_to_pfn(pud);
>> +
>> + set_p4d_safe(p4d,
>> + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
>> + }
>> +}
>> +
>> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>> +{
>> + if (pgtable_l4_enabled)
>> + return (pud_t *)__get_free_page(
>> + GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
>> + return NULL;
>> +}
>> +
>> +static inline void pud_free(struct mm_struct *mm, pud_t *pud)
>> +{
>> + if (pgtable_l4_enabled)
>> + free_page((unsigned long)pud);
>> +}
>> +
>> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
>> #endif /* __PAGETABLE_PMD_FOLDED */
>>
>> #define pmd_pgtable(pmd) pmd_page(pmd)
>> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
>> index b15f70a1fdfa..c84c31fbf8da 100644
>> --- a/arch/riscv/include/asm/pgtable-64.h
>> +++ b/arch/riscv/include/asm/pgtable-64.h
>> @@ -8,16 +8,32 @@
>>
>> #include <linux/const.h>
>>
>> -#define PGDIR_SHIFT 30
>> +extern bool pgtable_l4_enabled;
>> +
>> +#define PGDIR_SHIFT (pgtable_l4_enabled ? 39 : 30)
>> /* Size of region mapped by a page global directory */
>> #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
>> #define PGDIR_MASK (~(PGDIR_SIZE - 1))
>>
>> +/* pud is folded into pgd in case of 3-level page table */
>> +#define PUD_SHIFT 30
>> +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
>> +#define PUD_MASK (~(PUD_SIZE - 1))
>> +
>> #define PMD_SHIFT 21
>> /* Size of region mapped by a page middle directory */
>> #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
>> #define PMD_MASK (~(PMD_SIZE - 1))
>>
>> +/* Page Upper Directory entry */
>> +typedef struct {
>> + unsigned long pud;
>> +} pud_t;
>> +
>> +#define pud_val(x) ((x).pud)
>> +#define __pud(x) ((pud_t) { (x) })
>> +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
>> +
>> /* Page Middle Directory entry */
>> typedef struct {
>> unsigned long pmd;
>> @@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
>> set_pud(pudp, __pud(0));
>> }
>>
>> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
>> +{
>> + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
>> +}
>> +
>> +static inline unsigned long _pud_pfn(pud_t pud)
>> +{
>> + return pud_val(pud) >> _PAGE_PFN_SHIFT;
>> +}
>> +
>> static inline unsigned long pud_page_vaddr(pud_t pud)
>> {
>> return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
>> @@ -70,6 +96,15 @@ static inline struct page *pud_page(pud_t pud)
>> return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
>> }
>>
>> +#define mm_pud_folded mm_pud_folded
>> +static inline bool mm_pud_folded(struct mm_struct *mm)
>> +{
>> + if (pgtable_l4_enabled)
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
>>
>> static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
>> @@ -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
>> #define pmd_ERROR(e) \
>> pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
>>
>> +#define pud_ERROR(e) \
>> + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
>> +
>> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + *p4dp = p4d;
>> + else
>> + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
>> +}
>> +
>> +static inline int p4d_none(p4d_t p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + return (p4d_val(p4d) == 0);
>> +
>> + return 0;
>> +}
>> +
>> +static inline int p4d_present(p4d_t p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + return (p4d_val(p4d) & _PAGE_PRESENT);
>> +
>> + return 1;
>> +}
>> +
>> +static inline int p4d_bad(p4d_t p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + return !p4d_present(p4d);
>> +
>> + return 0;
>> +}
>> +
>> +static inline void p4d_clear(p4d_t *p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + set_p4d(p4d, __p4d(0));
>> +}
>> +
>> +static inline unsigned long p4d_page_vaddr(p4d_t p4d)
>> +{
>> + if (pgtable_l4_enabled)
>> + return (unsigned long)pfn_to_virt(
>> + p4d_val(p4d) >> _PAGE_PFN_SHIFT);
>> +
>> + return pud_page_vaddr((pud_t) { p4d_val(p4d) });
>> +}
>> +
>> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
>> +
>> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
>> +{
>> + if (pgtable_l4_enabled)
>> + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
>> +
>> + return (pud_t *)p4d;
>> +}
>> +
>> #endif /* _ASM_RISCV_PGTABLE_64_H */
>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>> index 8e96315b3366..b8a8ba69d0a2 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -20,12 +20,14 @@
>> * the kernel.
>> */
>> #define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1)
>> -#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR
>> +#define KERNEL_LINK_ADDR (VMALLOC_LINK_END - SZ_2G + 1)
>>
>> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
>> #define VMALLOC_END (PAGE_OFFSET - 1)
>> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
>>
>> +#define VMALLOC_LINK_END (_AC(CONFIG_PAGE_OFFSET, UL) - 1)
>> +
>> #define BPF_JIT_REGION_SIZE (SZ_128M)
>> #define BPF_JIT_REGION_START (kernel_virt_addr)
>> #define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)
>> @@ -67,8 +69,7 @@
>>
>> #ifndef __ASSEMBLY__
>>
>> -/* Page Upper Directory not used in RISC-V */
>> -#include <asm-generic/pgtable-nopud.h>
>> +#include <asm-generic/pgtable-nop4d.h>
>> #include <asm/page.h>
>> #include <asm/tlbflush.h>
>> #include <linux/mm_types.h>
>> @@ -81,7 +82,7 @@
>>
>> #ifdef CONFIG_MMU
>> #ifdef CONFIG_64BIT
>> -#define VA_BITS 39
>> +#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
>> #define PA_BITS 56
>> #else
>> #define VA_BITS 32
>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
>> index 8f5bb7731327..0632c4834c68 100644
>> --- a/arch/riscv/kernel/head.S
>> +++ b/arch/riscv/kernel/head.S
>> @@ -62,7 +62,8 @@ relocate:
>>
>> /* Compute satp for kernel page tables, but don't load it yet */
>> srl a2, a0, PAGE_SHIFT
>> - li a1, SATP_MODE
>> + la a1, satp_mode
>> + REG_L a1, 0(a1)
>> or a2, a2, a1
>>
>> /*
>> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
>> index 613ec81a8979..6830504f8b11 100644
>> --- a/arch/riscv/mm/context.c
>> +++ b/arch/riscv/mm/context.c
>> @@ -9,6 +9,8 @@
>> #include <asm/cacheflush.h>
>> #include <asm/mmu_context.h>
>>
>> +extern u64 satp_mode;
> Please move this to asm/pgtable.h next to "extern void *dtb_early_va".
>
> Same thing can be done for "pgtable_l4_enabled" to help PATCH7.
>
> I forgot to mention this in previous emails.
OK, I'll do that in v2 too, thanks.
Anup, do you have time to take a look at the relocatable series I
posted earlier?
As sv48 support depends on it, it would be nice to have your review too.
Thanks,
Alex
>
> Regards,
> Anup
>
>
>
>> +
>> /*
>> * When necessary, performs a deferred icache flush for the given MM context,
>> * on the local CPU. RISC-V has no direct mechanism for instruction cache
>> @@ -59,7 +61,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
>> cpumask_set_cpu(cpu, mm_cpumask(next));
>>
>> #ifdef CONFIG_MMU
>> - csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE);
>> + csr_write(CSR_SATP, virt_to_pfn(next->pgd) | satp_mode);
>> local_flush_tlb_all();
>> #endif
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 5782cae58ac2..bad8da099ff6 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -25,8 +25,23 @@
>>
>> #include "../kernel/head.h"
>>
>> -unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
>> +#ifdef CONFIG_64BIT
>> +u64 satp_mode = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ?
>> + SATP_MODE_39 : SATP_MODE_48;
>> +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ? false : true;
>> +#else
>> +u64 satp_mode = SATP_MODE_32;
>> +bool pgtable_l4_enabled;
>> +#endif
>> +EXPORT_SYMBOL(pgtable_l4_enabled);
>> +EXPORT_SYMBOL(satp_mode);
>> +
>> +unsigned long kernel_virt_addr;
>> EXPORT_SYMBOL(kernel_virt_addr);
>> +#ifdef CONFIG_RELOCATABLE
>> +unsigned long __page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
>> +EXPORT_SYMBOL(__page_offset);
>> +#endif
>>
>> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
>> __page_aligned_bss;
>> @@ -254,9 +269,12 @@ static void __init create_pte_mapping(pte_t *ptep,
>>
>> #ifndef __PAGETABLE_PMD_FOLDED
>>
>> +pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
>> pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
>> +pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
>> pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
>> pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>> +pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
>>
>> static pmd_t *__init get_pmd_virt(phys_addr_t pa)
>> {
>> @@ -273,7 +291,8 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
>> if (mmu_enabled)
>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>>
>> - BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
>> + /* Only one PMD is available for early mapping */
>> + BUG_ON((va - kernel_virt_addr) >> PUD_SHIFT);
>>
>> return (uintptr_t)early_pmd;
>> }
>> @@ -305,19 +324,70 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
>> create_pte_mapping(ptep, va, pa, sz, prot);
>> }
>>
>> -#define pgd_next_t pmd_t
>> -#define alloc_pgd_next(__va) alloc_pmd(__va)
>> -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
>> +static pud_t *__init get_pud_virt(phys_addr_t pa)
>> +{
>> + if (mmu_enabled) {
>> + clear_fixmap(FIX_PUD);
>> + return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
>> + } else {
>> + return (pud_t *)((uintptr_t)pa);
>> + }
>> +}
>> +
>> +static phys_addr_t __init alloc_pud(uintptr_t va)
>> +{
>> + if (mmu_enabled)
>> + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>> +
>> + /* Only one PUD is available for early mapping */
>> + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
>> +
>> + return (uintptr_t)early_pud;
>> +}
>> +
>> +static void __init create_pud_mapping(pud_t *pudp,
>> + uintptr_t va, phys_addr_t pa,
>> + phys_addr_t sz, pgprot_t prot)
>> +{
>> + pmd_t *nextp;
>> + phys_addr_t next_phys;
>> + uintptr_t pud_index = pud_index(va);
>> +
>> + if (sz == PUD_SIZE) {
>> + if (pud_val(pudp[pud_index]) == 0)
>> + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
>> + return;
>> + }
>> +
>> + if (pud_val(pudp[pud_index]) == 0) {
>> + next_phys = alloc_pmd(va);
>> + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
>> + nextp = get_pmd_virt(next_phys);
>> + memset(nextp, 0, PAGE_SIZE);
>> + } else {
>> + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
>> + nextp = get_pmd_virt(next_phys);
>> + }
>> +
>> + create_pmd_mapping(nextp, va, pa, sz, prot);
>> +}
>> +
>> +#define pgd_next_t pud_t
>> +#define alloc_pgd_next(__va) alloc_pud(__va)
>> +#define get_pgd_next_virt(__pa) get_pud_virt(__pa)
>> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
>> - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
>> -#define fixmap_pgd_next fixmap_pmd
>> + create_pud_mapping(__nextp, __va, __pa, __sz, __prot)
>> +#define fixmap_pgd_next (pgtable_l4_enabled ? \
>> + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
>> +#define trampoline_pgd_next (pgtable_l4_enabled ? \
>> + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
>> #else
>> #define pgd_next_t pte_t
>> #define alloc_pgd_next(__va) alloc_pte(__va)
>> #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
>> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
>> create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
>> -#define fixmap_pgd_next fixmap_pte
>> +#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
>> #endif
>>
>> static void __init create_pgd_mapping(pgd_t *pgdp,
>> @@ -328,6 +398,13 @@ static void __init create_pgd_mapping(pgd_t *pgdp,
>> phys_addr_t next_phys;
>> uintptr_t pgd_index = pgd_index(va);
>>
>> +#ifndef __PAGETABLE_PMD_FOLDED
>> + if (!pgtable_l4_enabled) {
>> + create_pud_mapping((pud_t *)pgdp, va, pa, sz, prot);
>> + return;
>> + }
>> +#endif
>> +
>> if (sz == PGDIR_SIZE) {
>> if (pgd_val(pgdp[pgd_index]) == 0)
>> pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
>> @@ -419,6 +496,47 @@ void __init relocate_kernel(uintptr_t load_pa)
>> }
>> }
>>
>> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
>> +void disable_pgtable_l4(void)
>> +{
>> + pgtable_l4_enabled = false;
>> + __page_offset = PAGE_OFFSET_L3;
>> + satp_mode = SATP_MODE_39;
>> +}
>> +
>> +/* There is a simple way to determine if 4-level is supported by the
>> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
>> + * then read SATP to see if the configuration was taken into account
>> + * meaning sv48 is supported.
>> + */
>> +asmlinkage __init void set_satp_mode(uintptr_t load_pa)
>> +{
>> + u64 identity_satp, hw_satp;
>> + int cpus_node;
>> +
>> + create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
>> + PGDIR_SIZE, PAGE_TABLE);
>> + create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
>> + PUD_SIZE, PAGE_TABLE);
>> + create_pmd_mapping(early_pmd, load_pa, load_pa,
>> + PMD_SIZE, PAGE_KERNEL_EXEC);
>> +
>> + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
>> + local_flush_tlb_all();
>> + csr_write(CSR_SATP, identity_satp);
>> +
>> + hw_satp = csr_read(CSR_SATP);
>> + csr_write(CSR_SATP, 0ULL);
>> + local_flush_tlb_all();
>> +
>> + if (hw_satp != identity_satp)
>> + disable_pgtable_l4();
>> +
>> + memset(early_pg_dir, 0, PAGE_SIZE);
>> + memset(early_pud, 0, PAGE_SIZE);
>> + memset(early_pmd, 0, PAGE_SIZE);
>> +}
>> +#endif
>> #endif
>>
>> static uintptr_t load_pa, load_sz;
>> @@ -442,9 +560,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>> load_pa = (uintptr_t)(&_start);
>> load_sz = (uintptr_t)(&_end) - load_pa;
>>
>> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
>> + set_satp_mode(load_pa);
>> +#endif
>> +
>> + kernel_virt_addr = KERNEL_VIRT_ADDR;
>> +
>> va_pa_offset = PAGE_OFFSET - load_pa;
>> va_kernel_pa_offset = kernel_virt_addr - load_pa;
>> -
>> pfn_base = PFN_DOWN(load_pa);
>>
>> #ifdef CONFIG_RELOCATABLE
>> @@ -473,15 +596,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>>
>> /* Setup early PGD for fixmap */
>> create_pgd_mapping(early_pg_dir, FIXADDR_START,
>> - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>> + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>>
>> #ifndef __PAGETABLE_PMD_FOLDED
>> - /* Setup fixmap PMD */
>> + /* Setup fixmap PUD and PMD */
>> + if (pgtable_l4_enabled)
>> + create_pud_mapping(fixmap_pud, FIXADDR_START,
>> + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
>> create_pmd_mapping(fixmap_pmd, FIXADDR_START,
>> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
>> +
>> /* Setup trampoline PGD and PMD */
>> create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
>> - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
>> + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
>> + if (pgtable_l4_enabled)
>> + create_pud_mapping(trampoline_pud, kernel_virt_addr,
>> + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
>> create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
>> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
>> #else
>> @@ -558,12 +688,13 @@ static void __init setup_vm_final(void)
>>
>> vm_area_add_early(&vm_kernel);
>>
>> - /* Clear fixmap PTE and PMD mappings */
>> + /* Clear fixmap page table mappings */
>> clear_fixmap(FIX_PTE);
>> clear_fixmap(FIX_PMD);
>> + clear_fixmap(FIX_PUD);
>>
>> /* Move to swapper page table */
>> - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
>> + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
>> local_flush_tlb_all();
>> }
>>
>> --
>> 2.20.1
>>
> -----Original Message-----
> From: [email protected] <linux-kernel-
> [email protected]> On Behalf Of Alex Ghiti
> Sent: 26 May 2020 22:00
> To: Anup Patel <[email protected]>
> Cc: Paul Walmsley <[email protected]>; Palmer Dabbelt
> <[email protected]>; Zong Li <[email protected]>; Christoph Hellwig
> <[email protected]>; linux-riscv <[email protected]>; linux-
> [email protected] List <[email protected]>
> Subject: Re: [PATCH 5/8] riscv: Implement sv48 support
>
> Le 5/25/20 à 2:45 AM, Anup Patel a écrit :
> > On Sun, May 24, 2020 at 2:45 PM Alexandre Ghiti <[email protected]> wrote:
> >> By adding a new 4th level of page table, give the possibility to
> >> 64bit kernel to address 2^48 bytes of virtual address: in practice,
> >> that roughly offers ~160TB of virtual address space to userspace and
> >> allows up to 64TB of physical memory.
> >>
> >> If the underlying hardware does not support sv48, we will
> >> automatically fallback to a standard 3-level page table by folding
> >> the new PUD level into PGDIR level. In order to detect HW
> >> capabilities at runtime, we use SATP feature that ignores writes with an
> unsupported mode.
> >>
> >> Signed-off-by: Alexandre Ghiti <[email protected]>
> >> ---
> >> arch/riscv/Kconfig | 6 +-
> >> arch/riscv/include/asm/csr.h | 3 +-
> >> arch/riscv/include/asm/fixmap.h | 1 +
> >> arch/riscv/include/asm/page.h | 15 +++
> >> arch/riscv/include/asm/pgalloc.h | 36 +++++++
> >> arch/riscv/include/asm/pgtable-64.h | 97 ++++++++++++++++-
> >> arch/riscv/include/asm/pgtable.h | 9 +-
> >> arch/riscv/kernel/head.S | 3 +-
> >> arch/riscv/mm/context.c | 4 +-
> >> arch/riscv/mm/init.c | 159 +++++++++++++++++++++++++---
> >> 10 files changed, 309 insertions(+), 24 deletions(-)
> >>
> >> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index
> >> e167f16131f4..3f73f60e9732 100644
> >> --- a/arch/riscv/Kconfig
> >> +++ b/arch/riscv/Kconfig
> >> @@ -68,6 +68,7 @@ config RISCV
> >> select ARCH_HAS_GCOV_PROFILE_ALL
> >> select HAVE_COPY_THREAD_TLS
> >> select HAVE_ARCH_KASAN if MMU && 64BIT
> >> + select RELOCATABLE if 64BIT
> >>
> >> config ARCH_MMAP_RND_BITS_MIN
> >> default 18 if 64BIT
> >> @@ -106,7 +107,7 @@ config PAGE_OFFSET
> >> default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB
> >> default 0x80000000 if 64BIT && !MMU
> >> default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
> >> - default 0xffffffe000000000 if 64BIT && !MAXPHYSMEM_2GB
> >> + default 0xffffc00000000000 if 64BIT && !MAXPHYSMEM_2GB
> >>
> >> config ARCH_FLATMEM_ENABLE
> >> def_bool y
> >> @@ -155,8 +156,11 @@ config GENERIC_HWEIGHT
> >> config FIX_EARLYCON_MEM
> >> def_bool MMU
> >>
> >> +# On a 64BIT relocatable kernel, the 4-level page table is at
> >> +runtime folded # on a 3-level page table when sv48 is not supported.
> >> config PGTABLE_LEVELS
> >> int
> >> + default 4 if 64BIT && RELOCATABLE
> >> default 3 if 64BIT
> >> default 2
> >>
> >> diff --git a/arch/riscv/include/asm/csr.h
> >> b/arch/riscv/include/asm/csr.h index cec462e198ce..d41536c3f8d4
> >> 100644
> >> --- a/arch/riscv/include/asm/csr.h
> >> +++ b/arch/riscv/include/asm/csr.h
> >> @@ -40,11 +40,10 @@
> >> #ifndef CONFIG_64BIT
> >> #define SATP_PPN _AC(0x003FFFFF, UL)
> >> #define SATP_MODE_32 _AC(0x80000000, UL)
> >> -#define SATP_MODE SATP_MODE_32
> >> #else
> >> #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
> >> #define SATP_MODE_39 _AC(0x8000000000000000, UL)
> >> -#define SATP_MODE SATP_MODE_39
> >> +#define SATP_MODE_48 _AC(0x9000000000000000, UL)
> >> #endif
> >>
> >> /* Exception cause high bit - is an interrupt if set */ diff --git
> >> a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
> >> index 2368d49eb4ef..d891cf9c73c5 100644
> >> --- a/arch/riscv/include/asm/fixmap.h
> >> +++ b/arch/riscv/include/asm/fixmap.h
> >> @@ -27,6 +27,7 @@ enum fixed_addresses {
> >> FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
> >> FIX_PTE,
> >> FIX_PMD,
> >> + FIX_PUD,
> >> FIX_TEXT_POKE1,
> >> FIX_TEXT_POKE0,
> >> FIX_EARLYCON_MEM_BASE,
> >> diff --git a/arch/riscv/include/asm/page.h
> >> b/arch/riscv/include/asm/page.h index 48bb09b6a9b7..5e77fe7f0d6d
> >> 100644
> >> --- a/arch/riscv/include/asm/page.h
> >> +++ b/arch/riscv/include/asm/page.h
> >> @@ -31,7 +31,19 @@
> >> * When not using MMU this corresponds to the first free page in
> >> * physical memory (aligned on a page boundary).
> >> */
> >> +#ifdef CONFIG_RELOCATABLE
> >> +#define PAGE_OFFSET __page_offset
> >> +
> >> +#ifdef CONFIG_64BIT
> >> +/*
> >> + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address
> >> +space so
> >> + * define the PAGE_OFFSET value for SV39.
> >> + */
> >> +#define PAGE_OFFSET_L3 0xffffffe000000000
> >> +#endif /* CONFIG_64BIT */
> >> +#else
> >> #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
> >> +#endif /* CONFIG_RELOCATABLE */
> >>
> >> #define KERN_VIRT_SIZE (-PAGE_OFFSET)
> >>
> >> @@ -102,6 +114,9 @@ extern unsigned long pfn_base;
> >> extern unsigned long max_low_pfn;
> >> extern unsigned long min_low_pfn;
> >> extern unsigned long kernel_virt_addr;
> >> +#ifdef CONFIG_RELOCATABLE
> >> +extern unsigned long __page_offset;
> >> +#endif
> >>
> >> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) +
> va_pa_offset))
> >> #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
> >> diff --git a/arch/riscv/include/asm/pgalloc.h
> >> b/arch/riscv/include/asm/pgalloc.h
> >> index 3f601ee8233f..540eaa5a8658 100644
> >> --- a/arch/riscv/include/asm/pgalloc.h
> >> +++ b/arch/riscv/include/asm/pgalloc.h
> >> @@ -36,6 +36,42 @@ static inline void pud_populate(struct mm_struct
> >> *mm, pud_t *pud, pmd_t *pmd)
> >>
> >> set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> >> }
> >> +
> >> +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d,
> >> +pud_t *pud) {
> >> + if (pgtable_l4_enabled) {
> >> + unsigned long pfn = virt_to_pfn(pud);
> >> +
> >> + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> >> + }
> >> +}
> >> +
> >> +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
> >> + pud_t *pud) {
> >> + if (pgtable_l4_enabled) {
> >> + unsigned long pfn = virt_to_pfn(pud);
> >> +
> >> + set_p4d_safe(p4d,
> >> + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
> >> + }
> >> +}
> >> +
> >> +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned
> >> +long addr) {
> >> + if (pgtable_l4_enabled)
> >> + return (pud_t *)__get_free_page(
> >> + GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_ZERO);
> >> + return NULL;
> >> +}
> >> +
> >> +static inline void pud_free(struct mm_struct *mm, pud_t *pud) {
> >> + if (pgtable_l4_enabled)
> >> + free_page((unsigned long)pud); }
> >> +
> >> +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
> >> #endif /* __PAGETABLE_PMD_FOLDED */
> >>
> >> #define pmd_pgtable(pmd) pmd_page(pmd)
> >> diff --git a/arch/riscv/include/asm/pgtable-64.h
> >> b/arch/riscv/include/asm/pgtable-64.h
> >> index b15f70a1fdfa..c84c31fbf8da 100644
> >> --- a/arch/riscv/include/asm/pgtable-64.h
> >> +++ b/arch/riscv/include/asm/pgtable-64.h
> >> @@ -8,16 +8,32 @@
> >>
> >> #include <linux/const.h>
> >>
> >> -#define PGDIR_SHIFT 30
> >> +extern bool pgtable_l4_enabled;
> >> +
> >> +#define PGDIR_SHIFT (pgtable_l4_enabled ? 39 : 30)
> >> /* Size of region mapped by a page global directory */
> >> #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
> >> #define PGDIR_MASK (~(PGDIR_SIZE - 1))
> >>
> >> +/* pud is folded into pgd in case of 3-level page table */
> >> +#define PUD_SHIFT 30
> >> +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
> >> +#define PUD_MASK (~(PUD_SIZE - 1))
> >> +
> >> #define PMD_SHIFT 21
> >> /* Size of region mapped by a page middle directory */
> >> #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
> >> #define PMD_MASK (~(PMD_SIZE - 1))
> >>
> >> +/* Page Upper Directory entry */
> >> +typedef struct {
> >> + unsigned long pud;
> >> +} pud_t;
> >> +
> >> +#define pud_val(x) ((x).pud)
> >> +#define __pud(x) ((pud_t) { (x) })
> >> +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
> >> +
> >> /* Page Middle Directory entry */
> >> typedef struct {
> >> unsigned long pmd;
> >> @@ -60,6 +76,16 @@ static inline void pud_clear(pud_t *pudp)
> >> set_pud(pudp, __pud(0));
> >> }
> >>
> >> +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) {
> >> + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); }
> >> +
> >> +static inline unsigned long _pud_pfn(pud_t pud) {
> >> + return pud_val(pud) >> _PAGE_PFN_SHIFT; }
> >> +
> >> static inline unsigned long pud_page_vaddr(pud_t pud)
> >> {
> >> return (unsigned long)pfn_to_virt(pud_val(pud) >>
> >> _PAGE_PFN_SHIFT); @@ -70,6 +96,15 @@ static inline struct page
> *pud_page(pud_t pud)
> >> return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
> >> }
> >>
> >> +#define mm_pud_folded mm_pud_folded static inline bool
> >> +mm_pud_folded(struct mm_struct *mm) {
> >> + if (pgtable_l4_enabled)
> >> + return false;
> >> +
> >> + return true;
> >> +}
> >> +
> >> #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD -
> >> 1))
> >>
> >> static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) @@
> >> -90,4 +125,64 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
> >> #define pmd_ERROR(e) \
> >> pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__,
> >> pmd_val(e))
> >>
> >> +#define pud_ERROR(e) \
> >> + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__,
> >> +pud_val(e))
> >> +
> >> +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) {
> >> + if (pgtable_l4_enabled)
> >> + *p4dp = p4d;
> >> + else
> >> + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); }
> >> +
> >> +static inline int p4d_none(p4d_t p4d) {
> >> + if (pgtable_l4_enabled)
> >> + return (p4d_val(p4d) == 0);
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static inline int p4d_present(p4d_t p4d) {
> >> + if (pgtable_l4_enabled)
> >> + return (p4d_val(p4d) & _PAGE_PRESENT);
> >> +
> >> + return 1;
> >> +}
> >> +
> >> +static inline int p4d_bad(p4d_t p4d) {
> >> + if (pgtable_l4_enabled)
> >> + return !p4d_present(p4d);
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static inline void p4d_clear(p4d_t *p4d) {
> >> + if (pgtable_l4_enabled)
> >> + set_p4d(p4d, __p4d(0)); }
> >> +
> >> +static inline unsigned long p4d_page_vaddr(p4d_t p4d) {
> >> + if (pgtable_l4_enabled)
> >> + return (unsigned long)pfn_to_virt(
> >> + p4d_val(p4d) >> _PAGE_PFN_SHIFT);
> >> +
> >> + return pud_page_vaddr((pud_t) { p4d_val(p4d) }); }
> >> +
> >> +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
> >> +
> >> +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) {
> >> + if (pgtable_l4_enabled)
> >> + return (pud_t *)p4d_page_vaddr(*p4d) +
> >> +pud_index(address);
> >> +
> >> + return (pud_t *)p4d;
> >> +}
> >> +
> >> #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git
> >> a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> >> index 8e96315b3366..b8a8ba69d0a2 100644
> >> --- a/arch/riscv/include/asm/pgtable.h
> >> +++ b/arch/riscv/include/asm/pgtable.h
> >> @@ -20,12 +20,14 @@
> >> * the kernel.
> >> */
> >> #define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1)
> >> -#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR
> >> +#define KERNEL_LINK_ADDR (VMALLOC_LINK_END - SZ_2G + 1)
> >>
> >> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
> >> #define VMALLOC_END (PAGE_OFFSET - 1)
> >> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
> >>
> >> +#define VMALLOC_LINK_END (_AC(CONFIG_PAGE_OFFSET, UL) - 1)
> >> +
> >> #define BPF_JIT_REGION_SIZE (SZ_128M)
> >> #define BPF_JIT_REGION_START (kernel_virt_addr)
> >> #define BPF_JIT_REGION_END (kernel_virt_addr +
> BPF_JIT_REGION_SIZE)
> >> @@ -67,8 +69,7 @@
> >>
> >> #ifndef __ASSEMBLY__
> >>
> >> -/* Page Upper Directory not used in RISC-V */ -#include
> >> <asm-generic/pgtable-nopud.h>
> >> +#include <asm-generic/pgtable-nop4d.h>
> >> #include <asm/page.h>
> >> #include <asm/tlbflush.h>
> >> #include <linux/mm_types.h>
> >> @@ -81,7 +82,7 @@
> >>
> >> #ifdef CONFIG_MMU
> >> #ifdef CONFIG_64BIT
> >> -#define VA_BITS 39
> >> +#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
> >> #define PA_BITS 56
> >> #else
> >> #define VA_BITS 32
> >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> >> index 8f5bb7731327..0632c4834c68 100644
> >> --- a/arch/riscv/kernel/head.S
> >> +++ b/arch/riscv/kernel/head.S
> >> @@ -62,7 +62,8 @@ relocate:
> >>
> >> /* Compute satp for kernel page tables, but don't load it yet */
> >> srl a2, a0, PAGE_SHIFT
> >> - li a1, SATP_MODE
> >> + la a1, satp_mode
> >> + REG_L a1, 0(a1)
> >> or a2, a2, a1
> >>
> >> /*
> >> diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index
> >> 613ec81a8979..6830504f8b11 100644
> >> --- a/arch/riscv/mm/context.c
> >> +++ b/arch/riscv/mm/context.c
> >> @@ -9,6 +9,8 @@
> >> #include <asm/cacheflush.h>
> >> #include <asm/mmu_context.h>
> >>
> >> +extern u64 satp_mode;
> > Please move this to asm/pgtable.h next to "extern void *dtb_early_va".
> >
> > Same thing can be done for "pgtable_l4_enabled" to help PATCH7.
> >
> > I forgot to mention this in previous emails.
>
>
> Ok, I'll do that in v2 too, thanks.
> Anup, do you have time to take a look at the relocatable series I have posted
> earlier ?
> As sv48 support depends on that, it would be nice to have your review too.
Sure, I will review it tomorrow or the day after.
Thanks,
Anup
>
> Thanks,
>
> Alex
>
>
> >
> > Regards,
> > Anup
> >
> >
> >
> >> +
> >> /*
> >> * When necessary, performs a deferred icache flush for the given MM
> context,
> >> * on the local CPU. RISC-V has no direct mechanism for
> >> instruction cache @@ -59,7 +61,7 @@ void switch_mm(struct mm_struct
> *prev, struct mm_struct *next,
> >> cpumask_set_cpu(cpu, mm_cpumask(next));
> >>
> >> #ifdef CONFIG_MMU
> >> - csr_write(CSR_SATP, virt_to_pfn(next->pgd) | SATP_MODE);
> >> + csr_write(CSR_SATP, virt_to_pfn(next->pgd) | satp_mode);
> >> local_flush_tlb_all();
> >> #endif
> >>
> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index
> >> 5782cae58ac2..bad8da099ff6 100644
> >> --- a/arch/riscv/mm/init.c
> >> +++ b/arch/riscv/mm/init.c
> >> @@ -25,8 +25,23 @@
> >>
> >> #include "../kernel/head.h"
> >>
> >> -unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
> >> +#ifdef CONFIG_64BIT
> >> +u64 satp_mode = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ?
> >> + SATP_MODE_39 : SATP_MODE_48; bool
> >> +pgtable_l4_enabled = IS_ENABLED(CONFIG_MAXPHYSMEM_2GB) ? false :
> >> +true; #else
> >> +u64 satp_mode = SATP_MODE_32;
> >> +bool pgtable_l4_enabled;
> >> +#endif
> >> +EXPORT_SYMBOL(pgtable_l4_enabled);
> >> +EXPORT_SYMBOL(satp_mode);
> >> +
> >> +unsigned long kernel_virt_addr;
> >> EXPORT_SYMBOL(kernel_virt_addr);
> >> +#ifdef CONFIG_RELOCATABLE
> >> +unsigned long __page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
> >> +EXPORT_SYMBOL(__page_offset); #endif
> >>
> >> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
> >>
> >> __page_aligned_bss; @@ -254,9 +269,12 @@ static void __init
> >> create_pte_mapping(pte_t *ptep,
> >>
> >> #ifndef __PAGETABLE_PMD_FOLDED
> >>
> >> +pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
> >> pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
> >> +pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
> >> pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
> >> pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
> >> +pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
> >>
> >> static pmd_t *__init get_pmd_virt(phys_addr_t pa)
> >> {
> >> @@ -273,7 +291,8 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
> >> if (mmu_enabled)
> >> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> >>
> >> - BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> >> + /* Only one PMD is available for early mapping */
> >> + BUG_ON((va - kernel_virt_addr) >> PUD_SHIFT);
> >>
> >> return (uintptr_t)early_pmd;
> >> }
> >> @@ -305,19 +324,70 @@ static void __init create_pmd_mapping(pmd_t
> *pmdp,
> >> create_pte_mapping(ptep, va, pa, sz, prot);
> >> }
> >>
> >> -#define pgd_next_t pmd_t
> >> -#define alloc_pgd_next(__va) alloc_pmd(__va)
> >> -#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
> >> +static pud_t *__init get_pud_virt(phys_addr_t pa) {
> >> + if (mmu_enabled) {
> >> + clear_fixmap(FIX_PUD);
> >> + return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
> >> + } else {
> >> + return (pud_t *)((uintptr_t)pa);
> >> + }
> >> +}
> >> +
> >> +static phys_addr_t __init alloc_pud(uintptr_t va) {
> >> + if (mmu_enabled)
> >> + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
> >> +
> >> + /* Only one PUD is available for early mapping */
> >> + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT);
> >> +
> >> + return (uintptr_t)early_pud;
> >> +}
> >> +
> >> +static void __init create_pud_mapping(pud_t *pudp,
> >> + uintptr_t va, phys_addr_t pa,
> >> + phys_addr_t sz, pgprot_t prot)
> >> +{
> >> + pmd_t *nextp;
> >> + phys_addr_t next_phys;
> >> + uintptr_t pud_index = pud_index(va);
> >> +
> >> + if (sz == PUD_SIZE) {
> >> + if (pud_val(pudp[pud_index]) == 0)
> >> + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
> >> + return;
> >> + }
> >> +
> >> + if (pud_val(pudp[pud_index]) == 0) {
> >> + next_phys = alloc_pmd(va);
> >> + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys),
> PAGE_TABLE);
> >> + nextp = get_pmd_virt(next_phys);
> >> + memset(nextp, 0, PAGE_SIZE);
> >> + } else {
> >> + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
> >> + nextp = get_pmd_virt(next_phys);
> >> + }
> >> +
> >> + create_pmd_mapping(nextp, va, pa, sz, prot); }
> >> +
> >> +#define pgd_next_t pud_t
> >> +#define alloc_pgd_next(__va) alloc_pud(__va)
> >> +#define get_pgd_next_virt(__pa) get_pud_virt(__pa)
> >> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> >> - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
> >> -#define fixmap_pgd_next fixmap_pmd
> >> + create_pud_mapping(__nextp, __va, __pa, __sz, __prot)
> >> +#define fixmap_pgd_next (pgtable_l4_enabled ? \
> >> + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
> >> +#define trampoline_pgd_next (pgtable_l4_enabled ? \
> >> + (uintptr_t)trampoline_pud :
> >> +(uintptr_t)trampoline_pmd)
> >> #else
> >> #define pgd_next_t pte_t
> >> #define alloc_pgd_next(__va) alloc_pte(__va)
> >> #define get_pgd_next_virt(__pa) get_pte_virt(__pa)
> >> #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
> >> create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
> >> -#define fixmap_pgd_next fixmap_pte
> >> +#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
> >> #endif
> >>
> >> static void __init create_pgd_mapping(pgd_t *pgdp, @@ -328,6
> >> +398,13 @@ static void __init create_pgd_mapping(pgd_t *pgdp,
> >> phys_addr_t next_phys;
> >> uintptr_t pgd_index = pgd_index(va);
> >>
> >> +#ifndef __PAGETABLE_PMD_FOLDED
> >> + if (!pgtable_l4_enabled) {
> >> + create_pud_mapping((pud_t *)pgdp, va, pa, sz, prot);
> >> + return;
> >> + }
> >> +#endif
> >> +
> >> if (sz == PGDIR_SIZE) {
> >> if (pgd_val(pgdp[pgd_index]) == 0)
> >> pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa),
> >> prot); @@ -419,6 +496,47 @@ void __init relocate_kernel(uintptr_t
> load_pa)
> >> }
> >> }
> >>
> >> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> void
> >> +disable_pgtable_l4(void) {
> >> + pgtable_l4_enabled = false;
> >> + __page_offset = PAGE_OFFSET_L3;
> >> + satp_mode = SATP_MODE_39;
> >> +}
> >> +
> >> +/* There is a simple way to determine if 4-level is supported by the
> >> + * underlying hardware: establish 1:1 mapping in 4-level page table mode
> >> + * then read SATP to see if the configuration was taken into account
> >> + * meaning sv48 is supported.
> >> + */
> >> +asmlinkage __init void set_satp_mode(uintptr_t load_pa)
> >> +{
> >> + u64 identity_satp, hw_satp;
> >> + int cpus_node;
> >> +
> >> + create_pgd_mapping(early_pg_dir, load_pa, (uintptr_t)early_pud,
> >> + PGDIR_SIZE, PAGE_TABLE);
> >> + create_pud_mapping(early_pud, load_pa, (uintptr_t)early_pmd,
> >> + PUD_SIZE, PAGE_TABLE);
> >> + create_pmd_mapping(early_pmd, load_pa, load_pa,
> >> + PMD_SIZE, PAGE_KERNEL_EXEC);
> >> +
> >> + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
> >> + local_flush_tlb_all();
> >> + csr_write(CSR_SATP, identity_satp);
> >> +
> >> + hw_satp = csr_read(CSR_SATP);
> >> + csr_write(CSR_SATP, 0ULL);
> >> + local_flush_tlb_all();
> >> +
> >> + if (hw_satp != identity_satp)
> >> + disable_pgtable_l4();
> >> +
> >> + memset(early_pg_dir, 0, PAGE_SIZE);
> >> + memset(early_pud, 0, PAGE_SIZE);
> >> + memset(early_pmd, 0, PAGE_SIZE);
> >> +}
> >> +#endif
> >> #endif
> >>
> >> static uintptr_t load_pa, load_sz;
> >> @@ -442,9 +560,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >> load_pa = (uintptr_t)(&_start);
> >> load_sz = (uintptr_t)(&_end) - load_pa;
> >>
> >> +#if defined(CONFIG_64BIT) && !defined(CONFIG_MAXPHYSMEM_2GB)
> >> + set_satp_mode(load_pa);
> >> +#endif
> >> +
> >> + kernel_virt_addr = KERNEL_VIRT_ADDR;
> >> +
> >> va_pa_offset = PAGE_OFFSET - load_pa;
> >> va_kernel_pa_offset = kernel_virt_addr - load_pa;
> >> -
> >> pfn_base = PFN_DOWN(load_pa);
> >>
> >> #ifdef CONFIG_RELOCATABLE
> >> @@ -473,15 +596,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
> >>
> >> /* Setup early PGD for fixmap */
> >> create_pgd_mapping(early_pg_dir, FIXADDR_START,
> >> - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >> + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >>
> >> #ifndef __PAGETABLE_PMD_FOLDED
> >> - /* Setup fixmap PMD */
> >> + /* Setup fixmap PUD and PMD */
> >> + if (pgtable_l4_enabled)
> >> + create_pud_mapping(fixmap_pud, FIXADDR_START,
> >> + (uintptr_t)fixmap_pmd, PUD_SIZE,
> >> + PAGE_TABLE);
> >> create_pmd_mapping(fixmap_pmd, FIXADDR_START,
> >> (uintptr_t)fixmap_pte, PMD_SIZE,
> >> PAGE_TABLE);
> >> +
> >> /* Setup trampoline PGD and PMD */
> >> create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
> >> - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> >> + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
> >> + if (pgtable_l4_enabled)
> >> + create_pud_mapping(trampoline_pud, kernel_virt_addr,
> >> + (uintptr_t)trampoline_pmd, PUD_SIZE,
> >> + PAGE_TABLE);
> >> create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
> >> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
> >> #else
> >> @@ -558,12 +688,13 @@ static void __init setup_vm_final(void)
> >>
> >> vm_area_add_early(&vm_kernel);
> >>
> >> - /* Clear fixmap PTE and PMD mappings */
> >> + /* Clear fixmap page table mappings */
> >> clear_fixmap(FIX_PTE);
> >> clear_fixmap(FIX_PMD);
> >> + clear_fixmap(FIX_PUD);
> >>
> >> /* Move to swapper page table */
> >> - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
> >> + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) |
> >> + satp_mode);
> >> local_flush_tlb_all();
> >> }
> >>
> >> --
> >> 2.20.1
> >>