This patchset intends to improve tlb utilization by using hugepages for
the linear mapping.
As reported by Anup in v6, when STRICT_KERNEL_RWX is enabled, we must
take care of isolating the kernel text and rodata so that they are not
mapped with a PUD mapping which would then assign wrong permissions to
the whole region: it is achieved by introducing a new memblock API.
Another patch makes use of this new API in arm64 which used some sort of
hack to solve this issue: it was built/boot tested successfully.
base-commit-tag: v6.3-rc1
v7:
- Fix Anup bug report by introducing memblock_isolate_memory which
allows us to split the memblock mappings and then avoid to map the
the PUD which contains the kernel as read only
- Add a patch to arm64 to use this newly introduced API
v6:
- quiet LLVM warning by casting phys_ram_base into an unsigned long
v5:
- Fix nommu builds by getting rid of riscv_pfn_base in patch 1, thanks
Conor
- Add RB from Andrew
v4:
- Rebase on top of v6.2-rc3, as noted by Conor
- Add Acked-by Rob
v3:
- Change the comment about initrd_start VA conversion so that it fits
ARM64 and RISCV64 (and others in the future if needed), as suggested
by Rob
v2:
- Add a comment on why RISCV64 does not need to set initrd_start/end that
early in the boot process, as asked by Rob
Alexandre Ghiti (4):
riscv: Get rid of riscv_pfn_base variable
mm: Introduce memblock_isolate_memory
arm64: Make use of memblock_isolate_memory for the linear mapping
riscv: Use PUD/P4D/PGD pages for the linear mapping
arch/arm64/mm/mmu.c | 4 ++--
arch/riscv/include/asm/page.h | 19 +++++++++++++++--
arch/riscv/mm/init.c | 39 ++++++++++++++++++++++++++---------
arch/riscv/mm/physaddr.c | 16 ++++++++++++++
drivers/of/fdt.c | 11 +++++-----
include/linux/memblock.h | 1 +
mm/memblock.c | 22 +++++++++++++++++++-
7 files changed, 92 insertions(+), 20 deletions(-)
--
2.37.2
Use directly phys_ram_base instead, riscv_pfn_base is just the pfn of
the address contained in phys_ram_base.
Even if there is no functional change intended in this patch, actually
setting phys_ram_base that early changes the behaviour of
kernel_mapping_pa_to_va during the early boot: phys_ram_base used to be
zero before this patch and now it is set to the physical start address of
the kernel. But it does not break the conversion of a kernel physical
address into a virtual address since kernel_mapping_pa_to_va should only
be used on kernel physical addresses, i.e. addresses greater than the
physical start address of the kernel.
Signed-off-by: Alexandre Ghiti <[email protected]>
Reviewed-by: Andrew Jones <[email protected]>
---
arch/riscv/include/asm/page.h | 3 +--
arch/riscv/mm/init.c | 6 +-----
2 files changed, 2 insertions(+), 7 deletions(-)
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 7fed7c431928..8dc686f549b6 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -91,8 +91,7 @@ typedef struct page *pgtable_t;
#endif
#ifdef CONFIG_MMU
-extern unsigned long riscv_pfn_base;
-#define ARCH_PFN_OFFSET (riscv_pfn_base)
+#define ARCH_PFN_OFFSET (PFN_DOWN((unsigned long)phys_ram_base))
#else
#define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT)
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 478d6763a01a..225a7d2b65cc 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -271,9 +271,6 @@ static void __init setup_bootmem(void)
#ifdef CONFIG_MMU
struct pt_alloc_ops pt_ops __initdata;
-unsigned long riscv_pfn_base __ro_after_init;
-EXPORT_SYMBOL(riscv_pfn_base);
-
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
@@ -285,7 +282,6 @@ static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAG
#ifdef CONFIG_XIP_KERNEL
#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops))
-#define riscv_pfn_base (*(unsigned long *)XIP_FIXUP(&riscv_pfn_base))
#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte))
#define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir))
@@ -985,7 +981,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
- riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr);
+ phys_ram_base = kernel_map.phys_addr;
/*
* The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
--
2.37.2
This function allows to split a region in memblock.memory and will be
useful when setting up the linear mapping with STRICT_KERNEL_RWX: it
allows to isolate the kernel text/rodata and then avoid to map those
regions with a PUD/P4D/PGD.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
include/linux/memblock.h | 1 +
mm/memblock.c | 22 +++++++++++++++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 50ad19662a32..2f7ef97c0da7 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -125,6 +125,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_isolate_memory(phys_addr_t base, phys_addr_t size);
void memblock_free_all(void);
void memblock_free(void *ptr, size_t size);
diff --git a/mm/memblock.c b/mm/memblock.c
index 25fd0626a9e7..d8cf1c9eccf0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -753,7 +753,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
int idx;
struct memblock_region *rgn;
- *start_rgn = *end_rgn = 0;
+ if (start_rgn && end_rgn)
+ *start_rgn = *end_rgn = 0;
if (!size)
return 0;
@@ -795,6 +796,9 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
memblock_get_region_node(rgn),
rgn->flags);
} else {
+ if (!end_rgn || !start_rgn)
+ continue;
+
/* @rgn is fully contained, record it */
if (!*end_rgn)
*start_rgn = idx;
@@ -805,6 +809,22 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
return 0;
}
+/**
+ * memblock_isolate_memory - isolate given range from memblock.memory
+ * @base: base of range to isolate
+ * @size: size of range to isolate
+ *
+ * Call memblock_isolate_range on memblock.memory to isolate the given range.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+
+int __init_memblock memblock_isolate_memory(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_isolate_range(&memblock.memory, base, size, NULL, NULL);
+}
+
static int __init_memblock memblock_remove_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
--
2.37.2
In order to isolate the kernel text mapping, we used some sort of hack
to isolate the kernel text range which consisted in marking this region
as not mappable with memblock_mark_nomap. Simply use the newly introduced
memblock_isolate_memory function which does exactly the same but does not
uselessly mark the region as not mappable.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/mmu.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6f9d8898a025..408dc852805c 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -552,7 +552,7 @@ static void __init map_mem(pgd_t *pgdp)
* So temporarily mark them as NOMAP to skip mappings in
* the following for-loop
*/
- memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+ memblock_isolate_memory(kernel_start, kernel_end - kernel_start);
#ifdef CONFIG_KEXEC_CORE
if (crash_mem_map) {
@@ -568,6 +568,7 @@ static void __init map_mem(pgd_t *pgdp)
for_each_mem_range(i, &start, &end) {
if (start >= end)
break;
+
/*
* The linear map must allow allocation tags reading/writing
* if MTE is present. Otherwise, it has the same attributes as
@@ -589,7 +590,6 @@ static void __init map_mem(pgd_t *pgdp)
*/
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
- memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
/*
* Use page-level mappings here so that we can shrink the region
--
2.37.2
During the early page table creation, we used to set the mapping for
PAGE_OFFSET to the kernel load address: but the kernel load address is
always offseted by PMD_SIZE which makes it impossible to use PUD/P4D/PGD
pages as this physical address is not aligned on PUD/P4D/PGD size (whereas
PAGE_OFFSET is).
But actually we don't have to establish this mapping (ie set va_pa_offset)
that early in the boot process because:
- first, setup_vm installs a temporary kernel mapping and among other
things, discovers the system memory,
- then, setup_vm_final creates the final kernel mapping and takes
advantage of the discovered system memory to create the linear
mapping.
During the first phase, we don't know the start of the system memory and
then until the second phase is finished, we can't use the linear mapping at
all and phys_to_virt/virt_to_phys translations must not be used because it
would result in a different translation from the 'real' one once the final
mapping is installed.
So here we simply delay the initialization of va_pa_offset to after the
system memory discovery. But to make sure noone uses the linear mapping
before, we add some guard in the DEBUG_VIRTUAL config.
Finally we can use PUD/P4D/PGD hugepages when possible, which will result
in a better TLB utilization.
Note that we rely on the firmware to protect itself using PMP.
Signed-off-by: Alexandre Ghiti <[email protected]>
Acked-by: Rob Herring <[email protected]> # DT bits
Reviewed-by: Andrew Jones <[email protected]>
---
arch/riscv/include/asm/page.h | 16 ++++++++++++++++
arch/riscv/mm/init.c | 35 +++++++++++++++++++++++++++++------
arch/riscv/mm/physaddr.c | 16 ++++++++++++++++
drivers/of/fdt.c | 11 ++++++-----
4 files changed, 67 insertions(+), 11 deletions(-)
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8dc686f549b6..ea1a0e237211 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -90,6 +90,14 @@ typedef struct page *pgtable_t;
#define PTE_FMT "%08lx"
#endif
+#ifdef CONFIG_64BIT
+/*
+ * We override this value as its generic definition uses __pa too early in
+ * the boot process (before kernel_map.va_pa_offset is set).
+ */
+#define MIN_MEMBLOCK_ADDR 0
+#endif
+
#ifdef CONFIG_MMU
#define ARCH_PFN_OFFSET (PFN_DOWN((unsigned long)phys_ram_base))
#else
@@ -121,7 +129,11 @@ extern phys_addr_t phys_ram_base;
#define is_linear_mapping(x) \
((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < PAGE_OFFSET + KERN_VIRT_SIZE))
+#ifndef CONFIG_DEBUG_VIRTUAL
#define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset))
+#else
+void *linear_mapping_pa_to_va(unsigned long x);
+#endif
#define kernel_mapping_pa_to_va(y) ({ \
unsigned long _y = (unsigned long)(y); \
(IS_ENABLED(CONFIG_XIP_KERNEL) && _y < phys_ram_base) ? \
@@ -130,7 +142,11 @@ extern phys_addr_t phys_ram_base;
})
#define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x)
+#ifndef CONFIG_DEBUG_VIRTUAL
#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - kernel_map.va_pa_offset)
+#else
+phys_addr_t linear_mapping_va_to_pa(unsigned long x);
+#endif
#define kernel_mapping_va_to_pa(y) ({ \
unsigned long _y = (unsigned long)(y); \
(IS_ENABLED(CONFIG_XIP_KERNEL) && _y < kernel_map.virt_addr + XIP_OFFSET) ? \
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 225a7d2b65cc..fed5109d27db 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -213,6 +213,13 @@ static void __init setup_bootmem(void)
phys_ram_end = memblock_end_of_DRAM();
if (!IS_ENABLED(CONFIG_XIP_KERNEL))
phys_ram_base = memblock_start_of_DRAM();
+
+ /*
+ * Any use of __va/__pa before this point is wrong as we did not know the
+ * start of DRAM before.
+ */
+ kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
+
/*
* memblock allocator is not aware of the fact that last 4K bytes of
* the addressable memory can not be mapped because of IS_ERR_VALUE
@@ -667,9 +674,16 @@ void __init create_pgd_mapping(pgd_t *pgdp,
static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
{
- /* Upgrade to PMD_SIZE mappings whenever possible */
- base &= PMD_SIZE - 1;
- if (!base && size >= PMD_SIZE)
+ if (!(base & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE)
+ return PGDIR_SIZE;
+
+ if (!(base & (P4D_SIZE - 1)) && size >= P4D_SIZE)
+ return P4D_SIZE;
+
+ if (!(base & (PUD_SIZE - 1)) && size >= PUD_SIZE)
+ return PUD_SIZE;
+
+ if (!(base & (PMD_SIZE - 1)) && size >= PMD_SIZE)
return PMD_SIZE;
return PAGE_SIZE;
@@ -978,11 +992,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
set_satp_mode();
#endif
- kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
+ kernel_map.va_pa_offset = 0UL;
kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
- phys_ram_base = kernel_map.phys_addr;
-
/*
* The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
* kernel, whereas for 64-bit kernel, the end of the virtual address
@@ -1097,6 +1109,17 @@ static void __init setup_vm_final(void)
__pa_symbol(fixmap_pgd_next),
PGDIR_SIZE, PAGE_TABLE);
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ /*
+ * Isolate the kernel text and rodata linear so they don't
+ * get mapped with a PUD in the linear mapping.
+ */
+ memblock_isolate_memory(__pa_symbol(_start),
+ __init_data_begin - _start);
+ memblock_isolate_memory(__pa_symbol(__start_rodata),
+ __start_rodata - _data);
+#endif
+
/* Map all memory banks in the linear mapping */
for_each_mem_range(i, &start, &end) {
if (start >= end)
diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
index 9b18bda74154..18706f457da7 100644
--- a/arch/riscv/mm/physaddr.c
+++ b/arch/riscv/mm/physaddr.c
@@ -33,3 +33,19 @@ phys_addr_t __phys_addr_symbol(unsigned long x)
return __va_to_pa_nodebug(x);
}
EXPORT_SYMBOL(__phys_addr_symbol);
+
+phys_addr_t linear_mapping_va_to_pa(unsigned long x)
+{
+ BUG_ON(!kernel_map.va_pa_offset);
+
+ return ((unsigned long)(x) - kernel_map.va_pa_offset);
+}
+EXPORT_SYMBOL(linear_mapping_va_to_pa);
+
+void *linear_mapping_pa_to_va(unsigned long x)
+{
+ BUG_ON(!kernel_map.va_pa_offset);
+
+ return ((void *)((unsigned long)(x) + kernel_map.va_pa_offset));
+}
+EXPORT_SYMBOL(linear_mapping_pa_to_va);
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index d1a68b6d03b3..d14735a81301 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -887,12 +887,13 @@ const void * __init of_flat_dt_match_machine(const void *default_match,
static void __early_init_dt_declare_initrd(unsigned long start,
unsigned long end)
{
- /* ARM64 would cause a BUG to occur here when CONFIG_DEBUG_VM is
- * enabled since __va() is called too early. ARM64 does make use
- * of phys_initrd_start/phys_initrd_size so we can skip this
- * conversion.
+ /*
+ * __va() is not yet available this early on some platforms. In that
+ * case, the platform uses phys_initrd_start/phys_initrd_size instead
+ * and does the VA conversion itself.
*/
- if (!IS_ENABLED(CONFIG_ARM64)) {
+ if (!IS_ENABLED(CONFIG_ARM64) &&
+ !(IS_ENABLED(CONFIG_RISCV) && IS_ENABLED(CONFIG_64BIT))) {
initrd_start = (unsigned long)__va(start);
initrd_end = (unsigned long)__va(end);
initrd_below_start_ok = 1;
--
2.37.2
On Fri, Mar 10, 2023 at 3:15 PM Alexandre Ghiti <[email protected]> wrote:
>
> This patchset intends to improve tlb utilization by using hugepages for
> the linear mapping.
>
> As reported by Anup in v6, when STRICT_KERNEL_RWX is enabled, we must
> take care of isolating the kernel text and rodata so that they are not
> mapped with a PUD mapping which would then assign wrong permissions to
> the whole region: it is achieved by introducing a new memblock API.
>
> Another patch makes use of this new API in arm64 which used some sort of
> hack to solve this issue: it was built/boot tested successfully.
>
> base-commit-tag: v6.3-rc1
>
> v7:
> - Fix Anup bug report by introducing memblock_isolate_memory which
> allows us to split the memblock mappings and then avoid to map the
> the PUD which contains the kernel as read only
> - Add a patch to arm64 to use this newly introduced API
>
> v6:
> - quiet LLVM warning by casting phys_ram_base into an unsigned long
>
> v5:
> - Fix nommu builds by getting rid of riscv_pfn_base in patch 1, thanks
> Conor
> - Add RB from Andrew
>
> v4:
> - Rebase on top of v6.2-rc3, as noted by Conor
> - Add Acked-by Rob
>
> v3:
> - Change the comment about initrd_start VA conversion so that it fits
> ARM64 and RISCV64 (and others in the future if needed), as suggested
> by Rob
>
> v2:
> - Add a comment on why RISCV64 does not need to set initrd_start/end that
> early in the boot process, as asked by Rob
>
> Alexandre Ghiti (4):
> riscv: Get rid of riscv_pfn_base variable
> mm: Introduce memblock_isolate_memory
> arm64: Make use of memblock_isolate_memory for the linear mapping
> riscv: Use PUD/P4D/PGD pages for the linear mapping
This works fine on QEMU RV64. I have tested on QEMU RV64 with
busybox and Fedora rootfs. I also tested KVM RV64 which works
fine as well.
But, QEMU RV32 with busybox rootfs is broken so you might want
to try that.
In any case, I have reviewed and tested this series at my end.
Reviewed-by: Anup Patel <[email protected]>
Tested-by: Anup Patel <[email protected]>
Regards,
Anup
>
> arch/arm64/mm/mmu.c | 4 ++--
> arch/riscv/include/asm/page.h | 19 +++++++++++++++--
> arch/riscv/mm/init.c | 39 ++++++++++++++++++++++++++---------
> arch/riscv/mm/physaddr.c | 16 ++++++++++++++
> drivers/of/fdt.c | 11 +++++-----
> include/linux/memblock.h | 1 +
> mm/memblock.c | 22 +++++++++++++++++++-
> 7 files changed, 92 insertions(+), 20 deletions(-)
>
> --
> 2.37.2
>
Hi Alexandre,
On Fri, Mar 10, 2023 at 10:45:37AM +0100, Alexandre Ghiti wrote:
> This function allows to split a region in memblock.memory and will be
> useful when setting up the linear mapping with STRICT_KERNEL_RWX: it
> allows to isolate the kernel text/rodata and then avoid to map those
> regions with a PUD/P4D/PGD.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> include/linux/memblock.h | 1 +
> mm/memblock.c | 22 +++++++++++++++++++++-
> 2 files changed, 22 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 50ad19662a32..2f7ef97c0da7 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -125,6 +125,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
> int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
> int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
> int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
> +int memblock_isolate_memory(phys_addr_t base, phys_addr_t size);
>
> void memblock_free_all(void);
> void memblock_free(void *ptr, size_t size);
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 25fd0626a9e7..d8cf1c9eccf0 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -753,7 +753,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> int idx;
> struct memblock_region *rgn;
>
> - *start_rgn = *end_rgn = 0;
> + if (start_rgn && end_rgn)
> + *start_rgn = *end_rgn = 0;
Generally, it's possible that either start_rgn or end_rgn will be a valid
pointer and this should be handled here and below.
My preference, though would be to leave memblock_isolate_range() as is and
have unused start_rgn and end_rgn in memblock_isolate_memory().
>
> if (!size)
> return 0;
> @@ -795,6 +796,9 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> memblock_get_region_node(rgn),
> rgn->flags);
> } else {
> + if (!end_rgn || !start_rgn)
> + continue;
> +
> /* @rgn is fully contained, record it */
> if (!*end_rgn)
> *start_rgn = idx;
> @@ -805,6 +809,22 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> return 0;
> }
>
> +/**
> + * memblock_isolate_memory - isolate given range from memblock.memory
I think it'd better to use "... range in memblock.memory"
> + * @base: base of range to isolate
> + * @size: size of range to isolate
> + *
> + * Call memblock_isolate_range on memblock.memory to isolate the given range.
Please elaborate that isolate means that the range does not share regions
with other ranges.
> + *
> + * Return:
> + * 0 on success, -errno on failure.
> + */
> +
> +int __init_memblock memblock_isolate_memory(phys_addr_t base, phys_addr_t size)
> +{
> + return memblock_isolate_range(&memblock.memory, base, size, NULL, NULL);
> +}
> +
> static int __init_memblock memblock_remove_range(struct memblock_type *type,
> phys_addr_t base, phys_addr_t size)
> {
> --
> 2.37.2
>
--
Sincerely yours,
Mike.
On 3/12/23 10:35, Mike Rapoport wrote:
> Hi Alexandre,
>
> On Fri, Mar 10, 2023 at 10:45:37AM +0100, Alexandre Ghiti wrote:
>> This function allows to split a region in memblock.memory and will be
>> useful when setting up the linear mapping with STRICT_KERNEL_RWX: it
>> allows to isolate the kernel text/rodata and then avoid to map those
>> regions with a PUD/P4D/PGD.
>>
>> Signed-off-by: Alexandre Ghiti <[email protected]>
>> ---
>> include/linux/memblock.h | 1 +
>> mm/memblock.c | 22 +++++++++++++++++++++-
>> 2 files changed, 22 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>> index 50ad19662a32..2f7ef97c0da7 100644
>> --- a/include/linux/memblock.h
>> +++ b/include/linux/memblock.h
>> @@ -125,6 +125,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
>> int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
>> int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
>> int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
>> +int memblock_isolate_memory(phys_addr_t base, phys_addr_t size);
>>
>> void memblock_free_all(void);
>> void memblock_free(void *ptr, size_t size);
>> diff --git a/mm/memblock.c b/mm/memblock.c
>> index 25fd0626a9e7..d8cf1c9eccf0 100644
>> --- a/mm/memblock.c
>> +++ b/mm/memblock.c
>> @@ -753,7 +753,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>> int idx;
>> struct memblock_region *rgn;
>>
>> - *start_rgn = *end_rgn = 0;
>> + if (start_rgn && end_rgn)
>> + *start_rgn = *end_rgn = 0;
> Generally, it's possible that either start_rgn or end_rgn will be a valid
> pointer and this should be handled here and below.
>
> My preference, though would be to leave memblock_isolate_range() as is and
> have unused start_rgn and end_rgn in memblock_isolate_memory().
Sure, I'll do that then.
>
>>
>> if (!size)
>> return 0;
>> @@ -795,6 +796,9 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>> memblock_get_region_node(rgn),
>> rgn->flags);
>> } else {
>> + if (!end_rgn || !start_rgn)
>> + continue;
>> +
>> /* @rgn is fully contained, record it */
>> if (!*end_rgn)
>> *start_rgn = idx;
>> @@ -805,6 +809,22 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>> return 0;
>> }
>>
>> +/**
>> + * memblock_isolate_memory - isolate given range from memblock.memory
> I think it'd better to use "... range in memblock.memory"
Ok
>
>> + * @base: base of range to isolate
>> + * @size: size of range to isolate
>> + *
>> + * Call memblock_isolate_range on memblock.memory to isolate the given range.
> Please elaborate that isolate means that the range does not share regions
> with other ranges.
Sure, thanks, I'll come up with a v8 tomorrow along with other fixes
(rv32 and arm64 comments that I forgot to remove).
Thanks again,
Alex
>> + *
>> + * Return:
>> + * 0 on success, -errno on failure.
>> + */
>> +
>> +int __init_memblock memblock_isolate_memory(phys_addr_t base, phys_addr_t size)
>> +{
>> + return memblock_isolate_range(&memblock.memory, base, size, NULL, NULL);
>> +}
>> +
>> static int __init_memblock memblock_remove_range(struct memblock_type *type,
>> phys_addr_t base, phys_addr_t size)
>> {
>> --
>> 2.37.2
>>
On Fri, Mar 10, 2023 at 10:45:38AM +0100, Alexandre Ghiti wrote:
> In order to isolate the kernel text mapping, we used some sort of hack
> to isolate the kernel text range which consisted in marking this region
> as not mappable with memblock_mark_nomap. Simply use the newly introduced
> memblock_isolate_memory function which does exactly the same but does not
> uselessly mark the region as not mappable.
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/arm64/mm/mmu.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 6f9d8898a025..408dc852805c 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -552,7 +552,7 @@ static void __init map_mem(pgd_t *pgdp)
> * So temporarily mark them as NOMAP to skip mappings in
> * the following for-loop
> */
The comment above doesn't apply anymore.
> - memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
> + memblock_isolate_memory(kernel_start, kernel_end - kernel_start);
>
> #ifdef CONFIG_KEXEC_CORE
> if (crash_mem_map) {
> @@ -568,6 +568,7 @@ static void __init map_mem(pgd_t *pgdp)
> for_each_mem_range(i, &start, &end) {
> if (start >= end)
> break;
> +
Mark nomap is also used for the crash kernel. Does the new API not work
for it?
Thanks,
drew
> /*
> * The linear map must allow allocation tags reading/writing
> * if MTE is present. Otherwise, it has the same attributes as
> @@ -589,7 +590,6 @@ static void __init map_mem(pgd_t *pgdp)
> */
> __map_memblock(pgdp, kernel_start, kernel_end,
> PAGE_KERNEL, NO_CONT_MAPPINGS);
> - memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
>
> /*
> * Use page-level mappings here so that we can shrink the region
> --
> 2.37.2
>
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Hi Andrew,
On 3/13/23 10:43, Andrew Jones wrote:
> On Fri, Mar 10, 2023 at 10:45:38AM +0100, Alexandre Ghiti wrote:
>> In order to isolate the kernel text mapping, we used some sort of hack
>> to isolate the kernel text range which consisted in marking this region
>> as not mappable with memblock_mark_nomap. Simply use the newly introduced
>> memblock_isolate_memory function which does exactly the same but does not
>> uselessly mark the region as not mappable.
>>
>> Signed-off-by: Alexandre Ghiti <[email protected]>
>> ---
>> arch/arm64/mm/mmu.c | 4 ++--
>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 6f9d8898a025..408dc852805c 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -552,7 +552,7 @@ static void __init map_mem(pgd_t *pgdp)
>> * So temporarily mark them as NOMAP to skip mappings in
>> * the following for-loop
>> */
> The comment above doesn't apply anymore.
Yep, noticed this one after sending, thanks anyway!
>
>> - memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>> + memblock_isolate_memory(kernel_start, kernel_end - kernel_start);
>>
>> #ifdef CONFIG_KEXEC_CORE
>> if (crash_mem_map) {
>> @@ -568,6 +568,7 @@ static void __init map_mem(pgd_t *pgdp)
>> for_each_mem_range(i, &start, &end) {
>> if (start >= end)
>> break;
>> +
> Mark nomap is also used for the crash kernel. Does the new API not work
> for it?
Seems you're right, I missed it.
Thanks,
Alex
>
> Thanks,
> drew
>
>> /*
>> * The linear map must allow allocation tags reading/writing
>> * if MTE is present. Otherwise, it has the same attributes as
>> @@ -589,7 +590,6 @@ static void __init map_mem(pgd_t *pgdp)
>> */
>> __map_memblock(pgdp, kernel_start, kernel_end,
>> PAGE_KERNEL, NO_CONT_MAPPINGS);
>> - memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
>>
>> /*
>> * Use page-level mappings here so that we can shrink the region
>> --
>> 2.37.2
>>
>>
>> _______________________________________________
>> linux-riscv mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv