LinuxLists.cc - [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

2022-06-30 11:47:13

Subject: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
(enable crashkernel, disable rodata full, disable kfence), the mem_map will
use non block/section mapping(for crashkernel requires to shrink the region
in page granularity). But it will degrade performance when doing larging
continuous mem access in kernel(memcpy/memmove, etc).

There are many changes and discussions:
commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
platforms with no DMA memory zones")
commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
mem_init()")
commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
reservation is required")

This patch changes mem_map to use block/section mapping with crashkernel.
Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
mem_map, reserve crashkernel memory. And then walking pagetable to split
block/section mapping to non block/section mapping(normally 4K) [[[only]]]
for crashkernel mem. So the linear mem mapping use block/section mapping
as more as possible. We will reduce the cpu dTLB miss conspicuously, and
accelerate mem access about 10-20% performance improvement.

I have tested it with pft(Page Fault Test) and fio, obtained great
performace improvement.

For fio test:
1.prepare ramdisk
modprobe -r brd
modprobe brd rd_nr=1 rd_size=67108864
dmsetup remove_all
wipefs -a --force /dev/ram0
mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
mkdir -p /fs/ram0
mount -t ext4 /dev/ram0 /fs/ram0

2.prepare fio paremeter in x.fio file:
[global]
bs=4k
ioengine=psync
iodepth=128
size=32G
direct=1
invalidate=1
group_reporting
thread=1
rw=read
directory=/fs/ram0
numjobs=1

[task_0]
cpus_allowed=16
stonewall=1

3.run testcase:
perf stat -e dTLB-load-misses fio x.fio

4.contrast
------------------------
without patch with patch
fio READ aggrb=1493.2MB/s aggrb=1775.3MB/s
dTLB-load-misses 1,818,320,693 438,729,774
time elapsed(s) 70.500326434 62.877316408
user(s) 15.926332000 15.684721000
sys(s) 54.211939000 47.046165000

5.conclusion
Using this patch will reduce dTLB misses and improve performace greatly.

Signed-off-by: Guanghui Feng <[email protected]>
---
arch/arm64/include/asm/mmu.h | 1 +
arch/arm64/mm/init.c | 8 +-
arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
3 files changed, 168 insertions(+), 72 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466..1a46b81 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
extern void arm64_memblock_init(void);
extern void paging_init(void);
extern void bootmem_init(void);
+extern void map_crashkernel(void);
extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
extern void init_mem_pgprot(void);
extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 339ee84..241d27e 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -190,6 +190,7 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
+ map_crashkernel();
}

/*
@@ -388,10 +389,6 @@ void __init arm64_memblock_init(void)
}

early_init_fdt_scan_reserved_mem();
-
- if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
- reserve_crashkernel();
-
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}

@@ -438,8 +435,7 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
- reserve_crashkernel();
+ reserve_crashkernel();

memblock_dump_all();
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32..4b779cf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -42,6 +42,7 @@
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/

u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
@@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
}

static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
- phys_addr_t phys, pgprot_t prot)
+ phys_addr_t phys, pgprot_t prot, int flags)
{
pte_t *ptep;

- ptep = pte_set_fixmap_offset(pmdp, addr);
+ ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
+ pte_set_fixmap_offset(pmdp, addr);
do {
pte_t old_pte = READ_ONCE(*ptep);

@@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys += PAGE_SIZE;
} while (ptep++, addr += PAGE_SIZE, addr != end);

- pte_clear_fixmap();
+ if (!(flags & NO_SEC_REMAPPINGS))
+ pte_clear_fixmap();
}

static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
@@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
next = pte_cont_addr_end(addr, end);

/* use a contiguous mapping if the range is suitably aligned */
- if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
+ if (!(flags & NO_SEC_REMAPPINGS) &&
+ (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

- init_pte(pmdp, addr, next, phys, __prot);
+ init_pte(pmdp, addr, next, phys, __prot, flags);

phys += next - addr;
} while (addr = next, addr != end);
}

+static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+ phys_addr_t map_offset;
+ pmdval_t pmdval;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
+ phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
+ pmd_clear(pmdp);
+ pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+ if (flags & NO_EXEC_MAPPINGS)
+ pmdval |= PMD_TABLE_PXN;
+ __pmd_populate(pmdp, pte_phys, pmdval);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+ map_offset = addr - (addr & PMD_MASK);
+ if (map_offset)
+ alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
+ phys - map_offset, prot,
+ pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+
+ if (next < (addr & PMD_MASK) + PMD_SIZE)
+ alloc_init_cont_pte(pmdp, next,
+ (addr & PUD_MASK) + PUD_SIZE,
+ next - addr + phys,
+ prot, pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+ }
+ alloc_init_cont_pte(pmdp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ phys += next - addr;
+ } while (pmdp++, addr = next, addr != end);
+}
+
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(int), int flags)
@@ -286,16 +332,87 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
next = pmd_cont_addr_end(addr, end);

/* use a contiguous mapping if the range is suitably aligned */
- if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
+ if (!(flags & NO_SEC_REMAPPINGS) &&
+ (((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
(flags & NO_CONT_MAPPINGS) == 0)
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

- init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
+ if (flags & NO_SEC_REMAPPINGS)
+ init_pmd_remap(pudp, addr, next, phys, __prot,
+ pgtable_alloc, flags);
+ else
+ init_pmd(pudp, addr, next, phys, __prot,
+ pgtable_alloc, flags);

phys += next - addr;
} while (addr = next, addr != end);
}

+static void init_pud_remap(pud_t *pudp, unsigned long addr, unsigned long next,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
+{
+ pudval_t pudval;
+ phys_addr_t map_offset;
+
+ if (!pud_none(*pudp) && pud_sect(*pudp)) {
+ phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
+ pud_clear(pudp);
+ pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
+ if (flags & NO_EXEC_MAPPINGS)
+ pudval |= PUD_TABLE_PXN;
+
+ __pud_populate(pudp, pmd_phys, pudval);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+ map_offset = addr - (addr & PUD_MASK);
+ if (map_offset)
+ alloc_init_cont_pmd(pudp, addr & PUD_MASK,
+ addr, phys - map_offset,
+ prot, pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+
+ if (next < (addr & PUD_MASK) + PUD_SIZE)
+ alloc_init_cont_pmd(pudp, next,
+ (addr & PUD_MASK) + PUD_SIZE,
+ next - addr + phys,
+ prot, pgtable_alloc,
+ flags & (~NO_SEC_REMAPPINGS));
+ }
+ alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+}
+
+static void init_pud(pud_t *pudp, unsigned long addr, unsigned long next,
+ phys_addr_t phys, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int),
+ int flags)
+{
+ pud_t old_pud = READ_ONCE(*pudp);
+ /*
+ * For 4K granule only, attempt to put down a 1GB block
+ */
+ if (pud_sect_supported() &&
+ ((addr | next | phys) & ~PUD_MASK) == 0 &&
+ (flags & NO_BLOCK_MAPPINGS) == 0) {
+ pud_set_huge(pudp, phys, prot);
+
+ /*
+ * After the PUD entry has been populated once, we
+ * only allow updates to the permission attributes.
+ */
+ BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
+ READ_ONCE(pud_val(*pudp))));
+ } else {
+ alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+
+ BUG_ON(pud_val(old_pud) != 0 &&
+ pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
+ }
+}
+
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(int),
@@ -325,37 +442,24 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
*/
if (system_state != SYSTEM_BOOTING)
mutex_lock(&fixmap_lock);
- pudp = pud_set_fixmap_offset(p4dp, addr);
- do {
- pud_t old_pud = READ_ONCE(*pudp);

+ pudp = (flags & NO_SEC_REMAPPINGS) ? pud_offset(p4dp, addr) :
+ pud_set_fixmap_offset(p4dp, addr);
+ do {
next = pud_addr_end(addr, end);

- /*
- * For 4K granule only, attempt to put down a 1GB block
- */
- if (pud_sect_supported() &&
- ((addr | next | phys) & ~PUD_MASK) == 0 &&
- (flags & NO_BLOCK_MAPPINGS) == 0) {
- pud_set_huge(pudp, phys, prot);
-
- /*
- * After the PUD entry has been populated once, we
- * only allow updates to the permission attributes.
- */
- BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
- READ_ONCE(pud_val(*pudp))));
- } else {
- alloc_init_cont_pmd(pudp, addr, next, phys, prot,
- pgtable_alloc, flags);
-
- BUG_ON(pud_val(old_pud) != 0 &&
- pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
- }
+ if (flags & NO_SEC_REMAPPINGS)
+ init_pud_remap(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ else
+ init_pud(pudp, addr, next, phys, prot, pgtable_alloc,
+ flags);
phys += next - addr;
} while (pudp++, addr = next, addr != end);

- pud_clear_fixmap();
+ if (!(flags & NO_SEC_REMAPPINGS))
+ pud_clear_fixmap();
+
if (system_state != SYSTEM_BOOTING)
mutex_unlock(&fixmap_lock);
}
@@ -483,20 +587,39 @@ void __init mark_linear_text_alias_ro(void)
PAGE_KERNEL_RO);
}

-static bool crash_mem_map __initdata;
+#ifdef CONFIG_KEXEC_CORE
+static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
+{
+ phys_addr_t phys;
+ void *ptr;

-static int __init enable_crash_mem_map(char *arg)
+ phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_NOLEAKTRACE);
+ if (!phys)
+ panic("Failed to allocate page table page\n");
+
+ ptr = (void *)__phys_to_virt(phys);
+ memset(ptr, 0, PAGE_SIZE);
+ return phys;
+}
+
+void __init map_crashkernel(void)
{
- /*
- * Proper parameter parsing is done by reserve_crashkernel(). We only
- * need to know if the linear map has to avoid block mappings so that
- * the crashkernel reservations can be unmapped later.
- */
- crash_mem_map = true;
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
+ return;

- return 0;
+ if (!crashk_res.end)
+ return;
+
+ __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
+ __phys_to_virt(crashk_res.start),
+ crashk_res.end + 1 - crashk_res.start, PAGE_KERNEL,
+ early_crashkernel_pgtable_alloc,
+ NO_EXEC_MAPPINGS | NO_SEC_REMAPPINGS);
}
-early_param("crashkernel", enable_crash_mem_map);
+#else
+void __init map_crashkernel(void) {}
+#endif

static void __init map_mem(pgd_t *pgdp)
{
@@ -527,17 +650,6 @@ static void __init map_mem(pgd_t *pgdp)
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

-#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map) {
- if (IS_ENABLED(CONFIG_ZONE_DMA) ||
- IS_ENABLED(CONFIG_ZONE_DMA32))
- flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
- else if (crashk_res.end)
- memblock_mark_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
-#endif
-
/* map all the memory banks */
for_each_mem_range(i, &start, &end) {
if (start >= end)
@@ -570,19 +682,6 @@ static void __init map_mem(pgd_t *pgdp)
* in page granularity and put back unused memory to buddy system
* through /sys/kernel/kexec_crash_size interface.
*/
-#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map &&
- !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
- if (crashk_res.end) {
- __map_memblock(pgdp, crashk_res.start,
- crashk_res.end + 1,
- PAGE_KERNEL,
- NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
- memblock_clear_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
- }
-#endif
}

void mark_rodata_ro(void)
--
1.8.3.1

2022-06-30 11:56:17

by Kefeng Wang

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

On 2022/6/30 18:50, Guanghui Feng wrote:
> The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> use non block/section mapping(for crashkernel requires to shrink the region
> in page granularity). But it will degrade performance when doing larging
> continuous mem access in kernel(memcpy/memmove, etc).
>
> There are many changes and discussions:
> commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
> platforms with no DMA memory zones")
> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
> mem_init()")
> commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
> reservation is required")
>
> This patch changes mem_map to use block/section mapping with crashkernel.
> Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> mem_map, reserve crashkernel memory. And then walking pagetable to split
> block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> for crashkernel mem. So the linear mem mapping use block/section mapping
> as more as possible. We will reduce the cpu dTLB miss conspicuously, and
> accelerate mem access about 10-20% performance improvement.
>
> I have tested it with pft(Page Fault Test) and fio, obtained great
> performace improvement.
>
> For fio test:
> 1.prepare ramdisk
> modprobe -r brd
> modprobe brd rd_nr=1 rd_size=67108864
> dmsetup remove_all
> wipefs -a --force /dev/ram0
> mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
> mkdir -p /fs/ram0
> mount -t ext4 /dev/ram0 /fs/ram0
>
> 2.prepare fio paremeter in x.fio file:
> [global]
> bs=4k
> ioengine=psync
> iodepth=128
> size=32G
> direct=1
> invalidate=1
> group_reporting
> thread=1
> rw=read
> directory=/fs/ram0
> numjobs=1
>
> [task_0]
> cpus_allowed=16
> stonewall=1
>
> 3.run testcase:
> perf stat -e dTLB-load-misses fio x.fio
>
> 4.contrast
> ------------------------
> without patch with patch
> fio READ aggrb=1493.2MB/s aggrb=1775.3MB/s
> dTLB-load-misses 1,818,320,693 438,729,774
> time elapsed(s) 70.500326434 62.877316408
> user(s) 15.926332000 15.684721000
> sys(s) 54.211939000 47.046165000
>
> 5.conclusion
> Using this patch will reduce dTLB misses and improve performace greatly.
Hi Guanhui， I do some basic test based on v1，it works well.
>
> Signed-off-by: Guanghui Feng <[email protected]>
> ---
> arch/arm64/include/asm/mmu.h | 1 +
> arch/arm64/mm/init.c | 8 +-
> arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
> 3 files changed, 168 insertions(+), 72 deletions(-)
>
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 48f8466..1a46b81 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
> extern void arm64_memblock_init(void);
> extern void paging_init(void);
> extern void bootmem_init(void);
> +extern void map_crashkernel(void);
> extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
> extern void init_mem_pgprot(void);
> extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 339ee84..241d27e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -190,6 +190,7 @@ static void __init reserve_crashkernel(void)
> crashk_res.start = crash_base;
> crashk_res.end = crash_base + crash_size - 1;
> insert_resource(&iomem_resource, &crashk_res);
> + map_crashkernel();
> }
>
There are many comment in init.c which added by commit 031495635b46 ("arm64:
Do not defer reserve_crashkernel() for platforms with no DMA memory zones"),
with this patch, there are not quite right, we should update or kill them.
> /*
> @@ -388,10 +389,6 @@ void __init arm64_memblock_init(void)
> }
>
> early_init_fdt_scan_reserved_mem();
> -
> - if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> - reserve_crashkernel();
> -
> high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
> }
>
> @@ -438,8 +435,7 @@ void __init bootmem_init(void)
> * request_standard_resources() depends on crashkernel's memory being
> * reserved, so do it here.
> */
ditto
> - if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
> - reserve_crashkernel();
> + reserve_crashkernel();
>
> memblock_dump_all();
> }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32..4b779cf 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -42,6 +42,7 @@
> #define NO_BLOCK_MAPPINGS BIT(0)
> #define NO_CONT_MAPPINGS BIT(1)
> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
> +#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
>
> u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
> u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
> @@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
> }
>
> static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> - phys_addr_t phys, pgprot_t prot)
> + phys_addr_t phys, pgprot_t prot, int flags)
> {
> pte_t *ptep;
>
> - ptep = pte_set_fixmap_offset(pmdp, addr);
> + ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
> + pte_set_fixmap_offset(pmdp, addr);
> do {
> pte_t old_pte = READ_ONCE(*ptep);
>
> @@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> phys += PAGE_SIZE;
> } while (ptep++, addr += PAGE_SIZE, addr != end);
>
> - pte_clear_fixmap();
> + if (!(flags & NO_SEC_REMAPPINGS))
> + pte_clear_fixmap();
> }
>
> static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> @@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> next = pte_cont_addr_end(addr, end);
>
> /* use a contiguous mapping if the range is suitably aligned */
> - if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> + if (!(flags & NO_SEC_REMAPPINGS) &&
> + (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> (flags & NO_CONT_MAPPINGS) == 0)
> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>
> - init_pte(pmdp, addr, next, phys, __prot);
> + init_pte(pmdp, addr, next, phys, __prot, flags);
>
> phys += next - addr;
> } while (addr = next, addr != end);
> }
>
> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> + unsigned long next;
> + pmd_t *pmdp;
> + phys_addr_t map_offset;
> + pmdval_t pmdval;
> +
> + pmdp = pmd_offset(pudp, addr);
> + do {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> + pmd_clear(pmdp);
> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pmdval |= PMD_TABLE_PXN;
> + __pmd_populate(pmdp, pte_phys, pmdval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> + map_offset = addr - (addr & PMD_MASK);
> + if (map_offset)
> + alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> + phys - map_offset, prot,
> + pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> +
> + if (next < (addr & PMD_MASK) + PMD_SIZE)
> + alloc_init_cont_pte(pmdp, next,
> + (addr & PUD_MASK) + PUD_SIZE,
> + next - addr + phys,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> + }
> + alloc_init_cont_pte(pmdp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + phys += next - addr;
> + } while (pmdp++, addr = next, addr != end);
> +}
> +
> static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot,
> phys_addr_t (*pgtable_alloc)(int), int flags)
> @@ -286,16 +332,87 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> next = pmd_cont_addr_end(addr, end);
>
> /* use a contiguous mapping if the range is suitably aligned */
> - if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
> + if (!(flags & NO_SEC_REMAPPINGS) &&
> + (((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
> (flags & NO_CONT_MAPPINGS) == 0)
> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>
> - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
> + if (flags & NO_SEC_REMAPPINGS)
> + init_pmd_remap(pudp, addr, next, phys, __prot,
> + pgtable_alloc, flags);
> + else
> + init_pmd(pudp, addr, next, phys, __prot,
> + pgtable_alloc, flags);
>
> phys += next - addr;
> } while (addr = next, addr != end);
> }
>
> +static void init_pud_remap(pud_t *pudp, unsigned long addr, unsigned long next,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int),
> + int flags)
> +{
> + pudval_t pudval;
> + phys_addr_t map_offset;
> +
> + if (!pud_none(*pudp) && pud_sect(*pudp)) {
> + phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
> + pud_clear(pudp);
> + pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pudval |= PUD_TABLE_PXN;
> +
> + __pud_populate(pudp, pmd_phys, pudval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> + map_offset = addr - (addr & PUD_MASK);
> + if (map_offset)
> + alloc_init_cont_pmd(pudp, addr & PUD_MASK,
> + addr, phys - map_offset,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> +
> + if (next < (addr & PUD_MASK) + PUD_SIZE)
> + alloc_init_cont_pmd(pudp, next,
> + (addr & PUD_MASK) + PUD_SIZE,
> + next - addr + phys,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> + }
> + alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> +}
> +
> +static void init_pud(pud_t *pudp, unsigned long addr, unsigned long next,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int),
> + int flags)
> +{
> + pud_t old_pud = READ_ONCE(*pudp);
> + /*
> + * For 4K granule only, attempt to put down a 1GB block
> + */
> + if (pud_sect_supported() &&
> + ((addr | next | phys) & ~PUD_MASK) == 0 &&
> + (flags & NO_BLOCK_MAPPINGS) == 0) {
> + pud_set_huge(pudp, phys, prot);
> +
> + /*
> + * After the PUD entry has been populated once, we
> + * only allow updates to the permission attributes.
> + */
> + BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
> + READ_ONCE(pud_val(*pudp))));
> + } else {
> + alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> +
> + BUG_ON(pud_val(old_pud) != 0 &&
> + pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
> + }
> +}
> +
> static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot,
> phys_addr_t (*pgtable_alloc)(int),
> @@ -325,37 +442,24 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> */
> if (system_state != SYSTEM_BOOTING)
> mutex_lock(&fixmap_lock);
> - pudp = pud_set_fixmap_offset(p4dp, addr);
> - do {
> - pud_t old_pud = READ_ONCE(*pudp);
>
> + pudp = (flags & NO_SEC_REMAPPINGS) ? pud_offset(p4dp, addr) :
> + pud_set_fixmap_offset(p4dp, addr);
Skip mutex_lock too as v1 said.
> + do {
> next = pud_addr_end(addr, end);
>
> - /*
> - * For 4K granule only, attempt to put down a 1GB block
> - */
> - if (pud_sect_supported() &&
> - ((addr | next | phys) & ~PUD_MASK) == 0 &&
> - (flags & NO_BLOCK_MAPPINGS) == 0) {
> - pud_set_huge(pudp, phys, prot);
> -
> - /*
> - * After the PUD entry has been populated once, we
> - * only allow updates to the permission attributes.
> - */
> - BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
> - READ_ONCE(pud_val(*pudp))));
> - } else {
> - alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> - pgtable_alloc, flags);
> -
> - BUG_ON(pud_val(old_pud) != 0 &&
> - pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
> - }
> + if (flags & NO_SEC_REMAPPINGS)
> + init_pud_remap(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + else
> + init_pud(pudp, addr, next, phys, prot, pgtable_alloc,
> + flags);
> phys += next - addr;
> } while (pudp++, addr = next, addr != end);
>
> - pud_clear_fixmap();
> + if (!(flags & NO_SEC_REMAPPINGS))
> + pud_clear_fixmap();
> +
> if (system_state != SYSTEM_BOOTING)
> mutex_unlock(&fixmap_lock);
> }
> @@ -483,20 +587,39 @@ void __init mark_linear_text_alias_ro(void)
> PAGE_KERNEL_RO);
> }
>
> -static bool crash_mem_map __initdata;
> +#ifdef CONFIG_KEXEC_CORE
> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
> +{
> + phys_addr_t phys;
> + void *ptr;
>
> -static int __init enable_crash_mem_map(char *arg)
> + phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
> + MEMBLOCK_ALLOC_NOLEAKTRACE);
> + if (!phys)
> + panic("Failed to allocate page table page\n");
> +
> + ptr = (void *)__phys_to_virt(phys);
> + memset(ptr, 0, PAGE_SIZE);
> + return phys;
> +}
> +
> +void __init map_crashkernel(void)
> {
> - /*
> - * Proper parameter parsing is done by reserve_crashkernel(). We only
> - * need to know if the linear map has to avoid block mappings so that
> - * the crashkernel reservations can be unmapped later.
> - */
> - crash_mem_map = true;
> + if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
> + return;
>
> - return 0;
> + if (!crashk_res.end)
> + return;
> +
> + __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
> + __phys_to_virt(crashk_res.start),
> + crashk_res.end + 1 - crashk_res.start, PAGE_KERNEL,
> + early_crashkernel_pgtable_alloc,
> + NO_EXEC_MAPPINGS | NO_SEC_REMAPPINGS);
> }
> -early_param("crashkernel", enable_crash_mem_map);
> +#else
> +void __init map_crashkernel(void) {}
> +#endif
>
> static void __init map_mem(pgd_t *pgdp)
> {
> @@ -527,17 +650,6 @@ static void __init map_mem(pgd_t *pgdp)
> */
> memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>
> -#ifdef CONFIG_KEXEC_CORE
> - if (crash_mem_map) {
> - if (IS_ENABLED(CONFIG_ZONE_DMA) ||
> - IS_ENABLED(CONFIG_ZONE_DMA32))
> - flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> - else if (crashk_res.end)
> - memblock_mark_nomap(crashk_res.start,
> - resource_size(&crashk_res));
> - }
> -#endif
> -
> /* map all the memory banks */
> for_each_mem_range(i, &start, &end) {
> if (start >= end)
> @@ -570,19 +682,6 @@ static void __init map_mem(pgd_t *pgdp)
> * in page granularity and put back unused memory to buddy system
> * through /sys/kernel/kexec_crash_size interface.
> */

kill comment too.

Thanks.

> -#ifdef CONFIG_KEXEC_CORE
> - if (crash_mem_map &&
> - !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
> - if (crashk_res.end) {
> - __map_memblock(pgdp, crashk_res.start,
> - crashk_res.end + 1,
> - PAGE_KERNEL,
> - NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> - memblock_clear_nomap(crashk_res.start,
> - resource_size(&crashk_res));
> - }
> - }
> -#endif
> }
>
> void mark_rodata_ro(void)

2022-06-30 13:06:39

by Guanghui Feng

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

在 2022/6/30 18:50, Guanghui Feng 写道:
> The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> use non block/section mapping(for crashkernel requires to shrink the region
> in page granularity). But it will degrade performance when doing larging
> continuous mem access in kernel(memcpy/memmove, etc).
>
> There are many changes and discussions:
> commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
> platforms with no DMA memory zones")
> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
> mem_init()")
> commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
> reservation is required")
>
> This patch changes mem_map to use block/section mapping with crashkernel.
> Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> mem_map, reserve crashkernel memory. And then walking pagetable to split
> block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> for crashkernel mem. So the linear mem mapping use block/section mapping
> as more as possible. We will reduce the cpu dTLB miss conspicuously, and
> accelerate mem access about 10-20% performance improvement.
>
> I have tested it with pft(Page Fault Test) and fio, obtained great
> performace improvement.
>
> For fio test:
> 1.prepare ramdisk
> modprobe -r brd
> modprobe brd rd_nr=1 rd_size=67108864
> dmsetup remove_all
> wipefs -a --force /dev/ram0
> mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
> mkdir -p /fs/ram0
> mount -t ext4 /dev/ram0 /fs/ram0
>
> 2.prepare fio paremeter in x.fio file:
> [global]
> bs=4k
> ioengine=psync
> iodepth=128
> size=32G
> direct=1
> invalidate=1
> group_reporting
> thread=1
> rw=read
> directory=/fs/ram0
> numjobs=1
>
> [task_0]
> cpus_allowed=16
> stonewall=1
>
> 3.run testcase:
> perf stat -e dTLB-load-misses fio x.fio
>
> 4.contrast
> ------------------------
> without patch with patch
> fio READ aggrb=1493.2MB/s aggrb=1775.3MB/s
> dTLB-load-misses 1,818,320,693 438,729,774
> time elapsed(s) 70.500326434 62.877316408
> user(s) 15.926332000 15.684721000
> sys(s) 54.211939000 47.046165000
>
> 5.conclusion
> Using this patch will reduce dTLB misses and improve performace greatly.
>
> Signed-off-by: Guanghui Feng <[email protected]>
> ---
> arch/arm64/include/asm/mmu.h | 1 +
> arch/arm64/mm/init.c | 8 +-
> arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
> 3 files changed, 168 insertions(+), 72 deletions(-)
>
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 48f8466..1a46b81 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
> extern void arm64_memblock_init(void);
> extern void paging_init(void);
> extern void bootmem_init(void);
> +extern void map_crashkernel(void);
> extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
> extern void init_mem_pgprot(void);
> extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 339ee84..241d27e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -190,6 +190,7 @@ static void __init reserve_crashkernel(void)
> crashk_res.start = crash_base;
> crashk_res.end = crash_base + crash_size - 1;
> insert_resource(&iomem_resource, &crashk_res);
> + map_crashkernel();
> }
>
> /*
> @@ -388,10 +389,6 @@ void __init arm64_memblock_init(void)
> }
>
> early_init_fdt_scan_reserved_mem();
> -
> - if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> - reserve_crashkernel();
> -
> high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
> }
>
> @@ -438,8 +435,7 @@ void __init bootmem_init(void)
> * request_standard_resources() depends on crashkernel's memory being
> * reserved, so do it here.
> */
> - if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
> - reserve_crashkernel();
> + reserve_crashkernel();
>
> memblock_dump_all();
> }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32..4b779cf 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -42,6 +42,7 @@
> #define NO_BLOCK_MAPPINGS BIT(0)
> #define NO_CONT_MAPPINGS BIT(1)
> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
> +#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
>
> u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
> u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
> @@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
> }
>
> static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> - phys_addr_t phys, pgprot_t prot)
> + phys_addr_t phys, pgprot_t prot, int flags)
> {
> pte_t *ptep;
>
> - ptep = pte_set_fixmap_offset(pmdp, addr);
> + ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
> + pte_set_fixmap_offset(pmdp, addr);

I'm sorry.
There is a bug.
In the init_pud_remap function rebuild left/right margin mem mapping(out
of crashkernel mem) without NO_SEC_REMAPPINGS, so it will call the
init_pmd/init_pte with using pmd_set_fixmap_offset/pte_set_fixmap_offset.

But when rebuilding, it should't use fixmap(because the pgdir have
switch to swapper_pg_dir)
I'll fix it soon.

> do {
> pte_t old_pte = READ_ONCE(*ptep);
>
> @@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> phys += PAGE_SIZE;
> } while (ptep++, addr += PAGE_SIZE, addr != end);
>
> - pte_clear_fixmap();
> + if (!(flags & NO_SEC_REMAPPINGS))
> + pte_clear_fixmap();
> }
>
> static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> @@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> next = pte_cont_addr_end(addr, end);
>
> /* use a contiguous mapping if the range is suitably aligned */
> - if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> + if (!(flags & NO_SEC_REMAPPINGS) &&
> + (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> (flags & NO_CONT_MAPPINGS) == 0)
> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>
> - init_pte(pmdp, addr, next, phys, __prot);
> + init_pte(pmdp, addr, next, phys, __prot, flags);
>
> phys += next - addr;
> } while (addr = next, addr != end);
> }
>
> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> + unsigned long next;
> + pmd_t *pmdp;
> + phys_addr_t map_offset;
> + pmdval_t pmdval;
> +
> + pmdp = pmd_offset(pudp, addr);
> + do {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> + pmd_clear(pmdp);
> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pmdval |= PMD_TABLE_PXN;
> + __pmd_populate(pmdp, pte_phys, pmdval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> + map_offset = addr - (addr & PMD_MASK);
> + if (map_offset)
> + alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> + phys - map_offset, prot,
> + pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> +
> + if (next < (addr & PMD_MASK) + PMD_SIZE)
> + alloc_init_cont_pte(pmdp, next,
> + (addr & PUD_MASK) + PUD_SIZE,
> + next - addr + phys,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> + }
> + alloc_init_cont_pte(pmdp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + phys += next - addr;
> + } while (pmdp++, addr = next, addr != end);
> +}
> +
> static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot,
> phys_addr_t (*pgtable_alloc)(int), int flags)
> @@ -286,16 +332,87 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> next = pmd_cont_addr_end(addr, end);
>
> /* use a contiguous mapping if the range is suitably aligned */
> - if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
> + if (!(flags & NO_SEC_REMAPPINGS) &&
> + (((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
> (flags & NO_CONT_MAPPINGS) == 0)
> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>
> - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
> + if (flags & NO_SEC_REMAPPINGS)
> + init_pmd_remap(pudp, addr, next, phys, __prot,
> + pgtable_alloc, flags);
> + else
> + init_pmd(pudp, addr, next, phys, __prot,
> + pgtable_alloc, flags);
>
> phys += next - addr;
> } while (addr = next, addr != end);
> }
>
> +static void init_pud_remap(pud_t *pudp, unsigned long addr, unsigned long next,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int),
> + int flags)
> +{
> + pudval_t pudval;
> + phys_addr_t map_offset;
> +
> + if (!pud_none(*pudp) && pud_sect(*pudp)) {
> + phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
> + pud_clear(pudp);
> + pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pudval |= PUD_TABLE_PXN;
> +
> + __pud_populate(pudp, pmd_phys, pudval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> + map_offset = addr - (addr & PUD_MASK);
> + if (map_offset)
> + alloc_init_cont_pmd(pudp, addr & PUD_MASK,
> + addr, phys - map_offset,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> +
> + if (next < (addr & PUD_MASK) + PUD_SIZE)
> + alloc_init_cont_pmd(pudp, next,
> + (addr & PUD_MASK) + PUD_SIZE,
> + next - addr + phys,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> + }
> + alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> +}
> +
> +static void init_pud(pud_t *pudp, unsigned long addr, unsigned long next,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int),
> + int flags)
> +{
> + pud_t old_pud = READ_ONCE(*pudp);
> + /*
> + * For 4K granule only, attempt to put down a 1GB block
> + */
> + if (pud_sect_supported() &&
> + ((addr | next | phys) & ~PUD_MASK) == 0 &&
> + (flags & NO_BLOCK_MAPPINGS) == 0) {
> + pud_set_huge(pudp, phys, prot);
> +
> + /*
> + * After the PUD entry has been populated once, we
> + * only allow updates to the permission attributes.
> + */
> + BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
> + READ_ONCE(pud_val(*pudp))));
> + } else {
> + alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> +
> + BUG_ON(pud_val(old_pud) != 0 &&
> + pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
> + }
> +}
> +
> static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot,
> phys_addr_t (*pgtable_alloc)(int),
> @@ -325,37 +442,24 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> */
> if (system_state != SYSTEM_BOOTING)
> mutex_lock(&fixmap_lock);
> - pudp = pud_set_fixmap_offset(p4dp, addr);
> - do {
> - pud_t old_pud = READ_ONCE(*pudp);
>
> + pudp = (flags & NO_SEC_REMAPPINGS) ? pud_offset(p4dp, addr) :
> + pud_set_fixmap_offset(p4dp, addr);
> + do {
> next = pud_addr_end(addr, end);
>
> - /*
> - * For 4K granule only, attempt to put down a 1GB block
> - */
> - if (pud_sect_supported() &&
> - ((addr | next | phys) & ~PUD_MASK) == 0 &&
> - (flags & NO_BLOCK_MAPPINGS) == 0) {
> - pud_set_huge(pudp, phys, prot);
> -
> - /*
> - * After the PUD entry has been populated once, we
> - * only allow updates to the permission attributes.
> - */
> - BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
> - READ_ONCE(pud_val(*pudp))));
> - } else {
> - alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> - pgtable_alloc, flags);
> -
> - BUG_ON(pud_val(old_pud) != 0 &&
> - pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
> - }
> + if (flags & NO_SEC_REMAPPINGS)
> + init_pud_remap(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + else
> + init_pud(pudp, addr, next, phys, prot, pgtable_alloc,
> + flags);
> phys += next - addr;
> } while (pudp++, addr = next, addr != end);
>
> - pud_clear_fixmap();
> + if (!(flags & NO_SEC_REMAPPINGS))
> + pud_clear_fixmap();
> +
> if (system_state != SYSTEM_BOOTING)
> mutex_unlock(&fixmap_lock);
> }
> @@ -483,20 +587,39 @@ void __init mark_linear_text_alias_ro(void)
> PAGE_KERNEL_RO);
> }
>
> -static bool crash_mem_map __initdata;
> +#ifdef CONFIG_KEXEC_CORE
> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
> +{
> + phys_addr_t phys;
> + void *ptr;
>
> -static int __init enable_crash_mem_map(char *arg)
> + phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
> + MEMBLOCK_ALLOC_NOLEAKTRACE);
> + if (!phys)
> + panic("Failed to allocate page table page\n");
> +
> + ptr = (void *)__phys_to_virt(phys);
> + memset(ptr, 0, PAGE_SIZE);
> + return phys;
> +}
> +
> +void __init map_crashkernel(void)
> {
> - /*
> - * Proper parameter parsing is done by reserve_crashkernel(). We only
> - * need to know if the linear map has to avoid block mappings so that
> - * the crashkernel reservations can be unmapped later.
> - */
> - crash_mem_map = true;
> + if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
> + return;
>
> - return 0;
> + if (!crashk_res.end)
> + return;
> +
> + __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
> + __phys_to_virt(crashk_res.start),
> + crashk_res.end + 1 - crashk_res.start, PAGE_KERNEL,
> + early_crashkernel_pgtable_alloc,
> + NO_EXEC_MAPPINGS | NO_SEC_REMAPPINGS);
> }
> -early_param("crashkernel", enable_crash_mem_map);
> +#else
> +void __init map_crashkernel(void) {}
> +#endif
>
> static void __init map_mem(pgd_t *pgdp)
> {
> @@ -527,17 +650,6 @@ static void __init map_mem(pgd_t *pgdp)
> */
> memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>
> -#ifdef CONFIG_KEXEC_CORE
> - if (crash_mem_map) {
> - if (IS_ENABLED(CONFIG_ZONE_DMA) ||
> - IS_ENABLED(CONFIG_ZONE_DMA32))
> - flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> - else if (crashk_res.end)
> - memblock_mark_nomap(crashk_res.start,
> - resource_size(&crashk_res));
> - }
> -#endif
> -
> /* map all the memory banks */
> for_each_mem_range(i, &start, &end) {
> if (start >= end)
> @@ -570,19 +682,6 @@ static void __init map_mem(pgd_t *pgdp)
> * in page granularity and put back unused memory to buddy system
> * through /sys/kernel/kexec_crash_size interface.
> */
> -#ifdef CONFIG_KEXEC_CORE
> - if (crash_mem_map &&
> - !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
> - if (crashk_res.end) {
> - __map_memblock(pgdp, crashk_res.start,
> - crashk_res.end + 1,
> - PAGE_KERNEL,
> - NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> - memblock_clear_nomap(crashk_res.start,
> - resource_size(&crashk_res));
> - }
> - }
> -#endif
> }
>
> void mark_rodata_ro(void)

2022-06-30 14:55:24

by Mike Rapoport

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

Hi,

On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
> The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> use non block/section mapping(for crashkernel requires to shrink the region
> in page granularity). But it will degrade performance when doing larging
> continuous mem access in kernel(memcpy/memmove, etc).
>
> There are many changes and discussions:
> commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
> platforms with no DMA memory zones")
> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
> mem_init()")
> commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
> reservation is required")
>
> This patch changes mem_map to use block/section mapping with crashkernel.
> Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> mem_map, reserve crashkernel memory. And then walking pagetable to split
> block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> for crashkernel mem. So the linear mem mapping use block/section mapping
> as more as possible. We will reduce the cpu dTLB miss conspicuously, and
> accelerate mem access about 10-20% performance improvement.

...

> Signed-off-by: Guanghui Feng <[email protected]>
> ---
> arch/arm64/include/asm/mmu.h | 1 +
> arch/arm64/mm/init.c | 8 +-
> arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
> 3 files changed, 168 insertions(+), 72 deletions(-)

...

> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32..4b779cf 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -42,6 +42,7 @@
> #define NO_BLOCK_MAPPINGS BIT(0)
> #define NO_CONT_MAPPINGS BIT(1)
> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
> +#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
>
> u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
> u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
> @@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
> }
>
> static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> - phys_addr_t phys, pgprot_t prot)
> + phys_addr_t phys, pgprot_t prot, int flags)
> {
> pte_t *ptep;
>
> - ptep = pte_set_fixmap_offset(pmdp, addr);
> + ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
> + pte_set_fixmap_offset(pmdp, addr);
> do {
> pte_t old_pte = READ_ONCE(*ptep);
>
> @@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> phys += PAGE_SIZE;
> } while (ptep++, addr += PAGE_SIZE, addr != end);
>
> - pte_clear_fixmap();
> + if (!(flags & NO_SEC_REMAPPINGS))
> + pte_clear_fixmap();
> }
>
> static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> @@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> next = pte_cont_addr_end(addr, end);
>
> /* use a contiguous mapping if the range is suitably aligned */
> - if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> + if (!(flags & NO_SEC_REMAPPINGS) &&
> + (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> (flags & NO_CONT_MAPPINGS) == 0)
> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>
> - init_pte(pmdp, addr, next, phys, __prot);
> + init_pte(pmdp, addr, next, phys, __prot, flags);
>
> phys += next - addr;
> } while (addr = next, addr != end);
> }
>
> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> + unsigned long next;
> + pmd_t *pmdp;
> + phys_addr_t map_offset;
> + pmdval_t pmdval;
> +
> + pmdp = pmd_offset(pudp, addr);
> + do {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> + pmd_clear(pmdp);
> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pmdval |= PMD_TABLE_PXN;
> + __pmd_populate(pmdp, pte_phys, pmdval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> + map_offset = addr - (addr & PMD_MASK);
> + if (map_offset)
> + alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> + phys - map_offset, prot,
> + pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> +
> + if (next < (addr & PMD_MASK) + PMD_SIZE)
> + alloc_init_cont_pte(pmdp, next,
> + (addr & PUD_MASK) + PUD_SIZE,
> + next - addr + phys,
> + prot, pgtable_alloc,
> + flags & (~NO_SEC_REMAPPINGS));
> + }
> + alloc_init_cont_pte(pmdp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + phys += next - addr;
> + } while (pmdp++, addr = next, addr != end);
> +}

There is still to much duplicated code here and in init_pud_remap().

Did you consider something like this:

void __init map_crashkernel(void)
{
int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
u64 size;

/*
* check if crash kernel supported, reserved etc
*/

size = crashk_res.end + 1 - crashk_res.start;

__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
__create_pgd_mapping(swapper_pg_dir, crashk_res.start,
__phys_to_virt(crashk_res.start), size,
PAGE_KERNEL, early_pgtable_alloc, flags);
}

...

--
Sincerely yours,
Mike.

2022-07-01 04:41:55

by Guanghui Feng

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

Thanks.

在 2022/6/30 21:46, Mike Rapoport 写道:
> Hi,
>
> On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
>> The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>> (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>> use non block/section mapping(for crashkernel requires to shrink the region
>> in page granularity). But it will degrade performance when doing larging
>> continuous mem access in kernel(memcpy/memmove, etc).
>>
>> There are many changes and discussions:
>> commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
>> platforms with no DMA memory zones")
>> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
>> mem_init()")
>> commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
>> reservation is required")
>>
>> This patch changes mem_map to use block/section mapping with crashkernel.
>> Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>> mem_map, reserve crashkernel memory. And then walking pagetable to split
>> block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>> for crashkernel mem. So the linear mem mapping use block/section mapping
>> as more as possible. We will reduce the cpu dTLB miss conspicuously, and
>> accelerate mem access about 10-20% performance improvement.
>
> ...
>
>> Signed-off-by: Guanghui Feng <[email protected]>
>> ---
>> arch/arm64/include/asm/mmu.h | 1 +
>> arch/arm64/mm/init.c | 8 +-
>> arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
>> 3 files changed, 168 insertions(+), 72 deletions(-)
>
> ...
>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 626ec32..4b779cf 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -42,6 +42,7 @@
>> #define NO_BLOCK_MAPPINGS BIT(0)
>> #define NO_CONT_MAPPINGS BIT(1)
>> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
>> +#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
>>
>> u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
>> u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
>> @@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
>> }
>>
>> static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> - phys_addr_t phys, pgprot_t prot)
>> + phys_addr_t phys, pgprot_t prot, int flags)
>> {
>> pte_t *ptep;
>>
>> - ptep = pte_set_fixmap_offset(pmdp, addr);
>> + ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
>> + pte_set_fixmap_offset(pmdp, addr);
>> do {
>> pte_t old_pte = READ_ONCE(*ptep);
>>
>> @@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> phys += PAGE_SIZE;
>> } while (ptep++, addr += PAGE_SIZE, addr != end);
>>
>> - pte_clear_fixmap();
>> + if (!(flags & NO_SEC_REMAPPINGS))
>> + pte_clear_fixmap();
>> }
>>
>> static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> @@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> next = pte_cont_addr_end(addr, end);
>>
>> /* use a contiguous mapping if the range is suitably aligned */
>> - if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
>> + if (!(flags & NO_SEC_REMAPPINGS) &&
>> + (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
>> (flags & NO_CONT_MAPPINGS) == 0)
>> __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>
>> - init_pte(pmdp, addr, next, phys, __prot);
>> + init_pte(pmdp, addr, next, phys, __prot, flags);
>>
>> phys += next - addr;
>> } while (addr = next, addr != end);
>> }
>>
>> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
>> + phys_addr_t phys, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int), int flags)
>> +{
>> + unsigned long next;
>> + pmd_t *pmdp;
>> + phys_addr_t map_offset;
>> + pmdval_t pmdval;
>> +
>> + pmdp = pmd_offset(pudp, addr);
>> + do {
>> + next = pmd_addr_end(addr, end);
>> +
>> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>> + pmd_clear(pmdp);
>> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> + if (flags & NO_EXEC_MAPPINGS)
>> + pmdval |= PMD_TABLE_PXN;
>> + __pmd_populate(pmdp, pte_phys, pmdval);
>> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +
>> + map_offset = addr - (addr & PMD_MASK);
>> + if (map_offset)
>> + alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>> + phys - map_offset, prot,
>> + pgtable_alloc,
>> + flags & (~NO_SEC_REMAPPINGS));
>> +
>> + if (next < (addr & PMD_MASK) + PMD_SIZE)
>> + alloc_init_cont_pte(pmdp, next,
>> + (addr & PUD_MASK) + PUD_SIZE,
>> + next - addr + phys,
>> + prot, pgtable_alloc,
>> + flags & (~NO_SEC_REMAPPINGS));
>> + }
>> + alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>> + pgtable_alloc, flags);
>> + phys += next - addr;
>> + } while (pmdp++, addr = next, addr != end);
>> +}
>
> There is still to much duplicated code here and in init_pud_remap().
>
> Did you consider something like this:
>
> void __init map_crashkernel(void)
> {
> int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> u64 size;
>
> /*
> * check if crash kernel supported, reserved etc
> */
>
>
> size = crashk_res.end + 1 - crashk_res.start;
>
> __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
> __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
> __phys_to_virt(crashk_res.start), size,
> PAGE_KERNEL, early_pgtable_alloc, flags);
> }
>
I'm trying do this.
But I think it's the Inverse Process of mem mapping and also generates
duplicated code(Boundary judgment, pagetable modify).

When removing the pgd mapping, it may split pud/pmd section which also
needs [[[rebuild and clear]]] some pagetable.
> ...
>

2022-07-01 17:13:45

by Mike Rapoport

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

On Fri, Jul 01, 2022 at 12:36:00PM +0800, guanghui.fgh wrote:
> Thanks.
>
> 在 2022/6/30 21:46, Mike Rapoport 写道:
> > Hi,
> >
> > On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
> > > The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> > > (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> > > use non block/section mapping(for crashkernel requires to shrink the region
> > > in page granularity). But it will degrade performance when doing larging
> > > continuous mem access in kernel(memcpy/memmove, etc).
> > >
> > > There are many changes and discussions:
> > > commit 031495635b46 ("arm64: Do not defer reserve_crashkernel() for
> > > platforms with no DMA memory zones")
> > > commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into
> > > mem_init()")
> > > commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel
> > > reservation is required")
> > >
> > > This patch changes mem_map to use block/section mapping with crashkernel.
> > > Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> > > mem_map, reserve crashkernel memory. And then walking pagetable to split
> > > block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> > > for crashkernel mem. So the linear mem mapping use block/section mapping
> > > as more as possible. We will reduce the cpu dTLB miss conspicuously, and
> > > accelerate mem access about 10-20% performance improvement.
> >
> > ...
> > > Signed-off-by: Guanghui Feng <[email protected]>
> > > ---
> > > arch/arm64/include/asm/mmu.h | 1 +
> > > arch/arm64/mm/init.c | 8 +-
> > > arch/arm64/mm/mmu.c | 231 ++++++++++++++++++++++++++++++-------------
> > > 3 files changed, 168 insertions(+), 72 deletions(-)
> >
> > ...
> >
> > > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> > > index 626ec32..4b779cf 100644
> > > --- a/arch/arm64/mm/mmu.c
> > > +++ b/arch/arm64/mm/mmu.c
> > > @@ -42,6 +42,7 @@
> > > #define NO_BLOCK_MAPPINGS BIT(0)
> > > #define NO_CONT_MAPPINGS BIT(1)
> > > #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
> > > +#define NO_SEC_REMAPPINGS BIT(3) /* rebuild with non block/sec mapping*/
> > > u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
> > > u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
> > > @@ -156,11 +157,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
> > > }
> > > static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> > > - phys_addr_t phys, pgprot_t prot)
> > > + phys_addr_t phys, pgprot_t prot, int flags)
> > > {
> > > pte_t *ptep;
> > > - ptep = pte_set_fixmap_offset(pmdp, addr);
> > > + ptep = (flags & NO_SEC_REMAPPINGS) ? pte_offset_kernel(pmdp, addr) :
> > > + pte_set_fixmap_offset(pmdp, addr);
> > > do {
> > > pte_t old_pte = READ_ONCE(*ptep);
> > > @@ -176,7 +178,8 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
> > > phys += PAGE_SIZE;
> > > } while (ptep++, addr += PAGE_SIZE, addr != end);
> > > - pte_clear_fixmap();
> > > + if (!(flags & NO_SEC_REMAPPINGS))
> > > + pte_clear_fixmap();
> > > }
> > > static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> > > @@ -208,16 +211,59 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> > > next = pte_cont_addr_end(addr, end);
> > > /* use a contiguous mapping if the range is suitably aligned */
> > > - if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> > > + if (!(flags & NO_SEC_REMAPPINGS) &&
> > > + (((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> > > (flags & NO_CONT_MAPPINGS) == 0)
> > > __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> > > - init_pte(pmdp, addr, next, phys, __prot);
> > > + init_pte(pmdp, addr, next, phys, __prot, flags);
> > > phys += next - addr;
> > > } while (addr = next, addr != end);
> > > }
> > > +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
> > > + phys_addr_t phys, pgprot_t prot,
> > > + phys_addr_t (*pgtable_alloc)(int), int flags)
> > > +{
> > > + unsigned long next;
> > > + pmd_t *pmdp;
> > > + phys_addr_t map_offset;
> > > + pmdval_t pmdval;
> > > +
> > > + pmdp = pmd_offset(pudp, addr);
> > > + do {
> > > + next = pmd_addr_end(addr, end);
> > > +
> > > + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> > > + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> > > + pmd_clear(pmdp);
> > > + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> > > + if (flags & NO_EXEC_MAPPINGS)
> > > + pmdval |= PMD_TABLE_PXN;
> > > + __pmd_populate(pmdp, pte_phys, pmdval);
> > > + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> > > +
> > > + map_offset = addr - (addr & PMD_MASK);
> > > + if (map_offset)
> > > + alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> > > + phys - map_offset, prot,
> > > + pgtable_alloc,
> > > + flags & (~NO_SEC_REMAPPINGS));
> > > +
> > > + if (next < (addr & PMD_MASK) + PMD_SIZE)
> > > + alloc_init_cont_pte(pmdp, next,
> > > + (addr & PUD_MASK) + PUD_SIZE,
> > > + next - addr + phys,
> > > + prot, pgtable_alloc,
> > > + flags & (~NO_SEC_REMAPPINGS));
> > > + }
> > > + alloc_init_cont_pte(pmdp, addr, next, phys, prot,
> > > + pgtable_alloc, flags);
> > > + phys += next - addr;
> > > + } while (pmdp++, addr = next, addr != end);
> > > +}
> >
> > There is still to much duplicated code here and in init_pud_remap().
> >
> > Did you consider something like this:
> >
> > void __init map_crashkernel(void)
> > {
> > int flags = NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> > u64 size;
> >
> > /*
> > * check if crash kernel supported, reserved etc
> > */
> >
> >
> > size = crashk_res.end + 1 - crashk_res.start;
> >
> > __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
> > __create_pgd_mapping(swapper_pg_dir, crashk_res.start,
> > __phys_to_virt(crashk_res.start), size,
> > PAGE_KERNEL, early_pgtable_alloc, flags);
> > }
> >
> I'm trying do this.
> But I think it's the Inverse Process of mem mapping and also generates
> duplicated code(Boundary judgment, pagetable modify).
>
> When removing the pgd mapping, it may split pud/pmd section which also needs
> [[[rebuild and clear]]] some pagetable.

Well, __remove_pgd_mapping() is probably an overkill, but
unmap_hotplug_pmd_range() and unmap_hotplug_pud_range() should do, depending
on the size of the crash kernel.

--
Sincerely yours,
Mike.

2022-07-01 17:35:23

by Catalin Marinas

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
> + phys_addr_t phys, pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> + unsigned long next;
> + pmd_t *pmdp;
> + phys_addr_t map_offset;
> + pmdval_t pmdval;
> +
> + pmdp = pmd_offset(pudp, addr);
> + do {
> + next = pmd_addr_end(addr, end);
> +
> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> + pmd_clear(pmdp);
> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> + if (flags & NO_EXEC_MAPPINGS)
> + pmdval |= PMD_TABLE_PXN;
> + __pmd_populate(pmdp, pte_phys, pmdval);
> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

This doesn't follow the architecture requirements for "break before
make" when changing live page tables. While it may work, it risks
triggering a TLB conflict abort. The correct sequence normally is:

pmd_clear();
flush_tlb_kernel_range();
__pmd_populate();

However, do we have any guarantees that the kernel doesn't access the
pmd range being unmapped temporarily? The page table itself might live
in one of these sections, so set_pmd() etc. can get a translation fault.

--
Catalin

2022-07-02 10:31:51

by Guanghui Feng

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

2022-07-02 11:09:27

by Guanghui Feng

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

2022-07-08 12:17:34

by Guanghui Feng

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

Thanks.

在 2022/7/2 1:24, Catalin Marinas 写道:
> On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
>> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned long end,
>> + phys_addr_t phys, pgprot_t prot,
>> + phys_addr_t (*pgtable_alloc)(int), int flags)
>> +{
>> + unsigned long next;
>> + pmd_t *pmdp;
>> + phys_addr_t map_offset;
>> + pmdval_t pmdval;
>> +
>> + pmdp = pmd_offset(pudp, addr);
>> + do {
>> + next = pmd_addr_end(addr, end);
>> +
>> + if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>> + phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>> + pmd_clear(pmdp);
>> + pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> + if (flags & NO_EXEC_MAPPINGS)
>> + pmdval |= PMD_TABLE_PXN;
>> + __pmd_populate(pmdp, pte_phys, pmdval);
>> + flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>
> This doesn't follow the architecture requirements for "break before
> make" when changing live page tables. While it may work, it risks
> triggering a TLB conflict abort. The correct sequence normally is:
>
> pmd_clear();
> flush_tlb_kernel_range();
> __pmd_populate();
>
> However, do we have any guarantees that the kernel doesn't access the
> pmd range being unmapped temporarily? The page table itself might live
> in one of these sections, so set_pmd() etc. can get a translation fault.
>
Thanks.

The cpu can generate a TLB conflict abort if it detects that the address
being looked up in the TLB hits multiple entries.

(1).I think when gathering small page to block/section mapping, there
maybe tlb conflict if no complying with BBM.

Namely:
a.Map a 4KB page (address X)
Touch that page, in order to get the translation cached in the TLB

b.Modify the translation tables
replacing the mapping for address X with a 2MB mapping - DO NOT
INVALIDATE the TLB

c.Touch "X + 4KB"
This will/should miss in the TLB, causing a new walk returning the
2MB mapping

d.Touch X
Assuming they've not been evicted, you'll hit both on the 4KB and 2MB
mapping - as both cover address X.

There is tlb conflict.
(link:
https://community.arm.com/support-forums/f/dev-platforms-forum/13583/tlb-conflict-abort)

(2).But when spliting large block/section mapping to small granularity,
there maybe no tlb conflict.

Namely:
a.rebuild the pte level mapping without any change to orgin pagetable
(the relation between virtual address and physicall address keep same)

b.modify 1G mappting to use the new pte level mapping in the [[[mem]]]
without tlb flush

c.When the cpu access the 1G mem(anywhere),
If 1G tlb entry already cached in tlb, all the 1G mem will access
success(without any tlb loaded, no confilict)

If 1G tlb entry has been evicted, then the tlb will access pagetable
in mem(despite the cpu "catch" the old(1G) or new(4k) mapped pagetale in
the mem, all the 1G mem can access sucess)(load new tlb entry, no conflict)

d.Afterward, we flush the tlb and force cpu use the new pagetable.(no
conflict)

It seems that there are no two tlb entries for a same virtual address in
the tlb cache When spliting large block/section mapping.

(3).At the same time, I think we can use another way.
As the system linear maping is builded with init_pg_dir, we can also
resue the init_pg_dir to split the block/setion mapping sometime.
As init_pg_dir contain all kernel text/data access and we can comply
with the BBM requirement.

a.rebuild new pte level mapping without any change to the old
mapping(the cpu can't walk access the new page mapping, it's isolated)

b.change to use init_pg_dir

c.clear the old 1G block mapping and flush tlb

d.modify the linear mapping to use new pte level page mapping with
init_pg_dir(TLB BBM)

e.switch to swapper_pg_dir

Could you give me some advice?

Thanks.

2022-07-08 14:28:29

by Robin Murphy

[permalink] [raw]

Subject: Re: [PATCH v3] arm64: mm: fix linear mapping mem access performance degradation

On 2022-07-08 13:13, guanghui.fgh wrote:
> Thanks.
>
> 在 2022/7/2 1:24, Catalin Marinas 写道:
>> On Thu, Jun 30, 2022 at 06:50:22PM +0800, Guanghui Feng wrote:
>>> +static void init_pmd_remap(pud_t *pudp, unsigned long addr, unsigned
>>> long end,
>>> +               phys_addr_t phys, pgprot_t prot,
>>> +               phys_addr_t (*pgtable_alloc)(int), int flags)
>>> +{
>>> +    unsigned long next;
>>> +    pmd_t *pmdp;
>>> +    phys_addr_t map_offset;
>>> +    pmdval_t pmdval;
>>> +
>>> +    pmdp = pmd_offset(pudp, addr);
>>> +    do {
>>> +        next = pmd_addr_end(addr, end);
>>> +
>>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>>> +            pmd_clear(pmdp);
>>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>> +            if (flags & NO_EXEC_MAPPINGS)
>>> +                pmdval |= PMD_TABLE_PXN;
>>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>
>> This doesn't follow the architecture requirements for "break before
>> make" when changing live page tables. While it may work, it risks
>> triggering a TLB conflict abort. The correct sequence normally is:
>>
>>     pmd_clear();
>>     flush_tlb_kernel_range();
>>     __pmd_populate();
>>
>> However, do we have any guarantees that the kernel doesn't access the
>> pmd range being unmapped temporarily? The page table itself might live
>> in one of these sections, so set_pmd() etc. can get a translation fault.
>>
> Thanks.
>
> The cpu can generate a TLB conflict abort if it detects that the address
> being looked up in the TLB hits multiple entries.
>
> (1).I think when gathering small page to block/section mapping, there
> maybe tlb conflict if no complying with BBM.
>
> Namely:
> a.Map a 4KB page (address X)
> Touch that page, in order to get the translation cached in the TLB
>
> b.Modify the translation tables
> replacing the mapping for address X with a 2MB mapping - DO NOT
> INVALIDATE the TLB
>
> c.Touch "X + 4KB"
> This will/should miss in the TLB, causing a new walk returning the
> 2MB mapping
>
> d.Touch X
> Assuming they've not been evicted, you'll hit both on the 4KB and 2MB
> mapping - as both cover address X.
>
> There is tlb conflict.
> (link:
> https://community.arm.com/support-forums/f/dev-platforms-forum/13583/tlb-conflict-abort)
>
>
>
>
> (2).But when spliting large block/section mapping to small granularity,
> there maybe no tlb conflict.
>
> Namely:
> a.rebuild the pte level mapping without any change to orgin pagetable
> (the relation between virtual address and physicall address keep same)
>
> b.modify 1G mappting to use the new pte level mapping in the [[[mem]]]
> without tlb flush
>
> c.When the cpu access the 1G mem(anywhere),
> If 1G tlb entry already cached in tlb, all the 1G mem will access
> success(without any tlb loaded, no confilict)

No. The CPU could still have prefetched the new table entry for any
reason as soon as it became visible, then raise an abort when it
realises it already has a leaf entry cached for that address range. Or
worse, *not* raise an abort but instead use "An amalgamation of the old
and new data or control values" for the resulting translation (see
K1.2.3 "CONSTRAINED UNPREDICTABLE behaviors due to caching of control or
data values" in the Armv8 ARM) and quietly corrupt memory elsewhere.

Robin.

>
> If 1G tlb entry has been evicted, then the tlb will access pagetable
> in mem(despite the cpu "catch" the old(1G) or new(4k) mapped pagetale in
> the mem, all the 1G mem can access sucess)(load new tlb entry, no conflict)
>
> d.Afterward, we flush the tlb and force cpu use the new pagetable.(no
> conflict)
>
> It seems that there are no two tlb entries for a same virtual address in
> the tlb cache When spliting large block/section mapping.
>
>
>
> (3).At the same time, I think we can use another way.
> As the system linear maping is builded with init_pg_dir, we can also
> resue the init_pg_dir to split the block/setion mapping sometime.
> As init_pg_dir contain all kernel text/data access and we can comply
> with the BBM requirement.
>
> a.rebuild new pte level mapping without any change to the old
> mapping(the cpu can't walk access the new page mapping, it's isolated)
>
> b.change to use init_pg_dir
>
> c.clear the old 1G block mapping and flush tlb
>
> d.modify the linear mapping to use new pte level page mapping with
> init_pg_dir(TLB BBM)
>
> e.switch to swapper_pg_dir
>
>
> Could you give me some advice?
>
> Thanks.
>
> _______________________________________________
> linux-arm-kernel mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel