From: Mike Rapoport <[email protected]>
Hi,
There were several rounds of discussion how to remap with base pages only
the crash kernel area, the latest one here:
https://lore.kernel.org/all/[email protected]
and this is my attempt to allow having both large pages in the linear map
and protection for the crash kernel memory.
For server systems it is important to protect crash kernel memory for
post-mortem analysis, and for that protection to work the crash kernel
memory should be mapped with base pages in the linear map.
On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
happens after the linear map is created and the current code forces using
base pages for the entire linear map, which results in performance
degradation.
These patches enable remapping of the crash kernel area with base pages
while keeping large pages in the rest of the linear map.
The idea is to align crash kernel reservation to PUD boundaries, remap that
PUD and then free the extra memory.
For now the remapping does not deal with the case when crash kernel base is
specified, but this won't be a problem to add if the idea is generally
acceptable.
RFC: https://lore.kernel.org/all/[email protected]
Mike Rapoport (5):
arm64: rename defer_reserve_crashkernel() to have_zone_dma()
arm64/mmu: drop _hotplug from unmap_hotplug_* function names
arm64/mmu: move helpers for hotplug page tables freeing close to callers
arm64/mm: remap crash kernel with base pages even if rodata_full disabled
arm64/mmu: simplify logic around crash kernel mapping in map_mem()
arch/arm64/include/asm/memory.h | 2 +-
arch/arm64/include/asm/mmu.h | 3 +
arch/arm64/kernel/machine_kexec.c | 6 ++
arch/arm64/mm/init.c | 69 +++++++++++---
arch/arm64/mm/mmu.c | 152 ++++++++++++++++--------------
5 files changed, 147 insertions(+), 85 deletions(-)
base-commit: 568035b01cfb107af8d2e4bd2fb9aea22cf5b868
--
2.35.3
From: Mike Rapoport <[email protected]>
so that they can be used for remapping crash kernel.
Signed-off-by: Mike Rapoport <[email protected]>
---
arch/arm64/mm/mmu.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index bf303f1dea25..ea81e40a25cd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -911,7 +911,7 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
return true;
}
-static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+static void unmap_pte_range(pmd_t *pmdp, unsigned long addr,
unsigned long end, bool free_mapped,
struct vmem_altmap *altmap)
{
@@ -932,7 +932,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
} while (addr += PAGE_SIZE, addr < end);
}
-static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+static void unmap_pmd_range(pud_t *pudp, unsigned long addr,
unsigned long end, bool free_mapped,
struct vmem_altmap *altmap)
{
@@ -961,11 +961,11 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
continue;
}
WARN_ON(!pmd_table(pmd));
- unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
+ unmap_pte_range(pmdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
-static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+static void unmap_pud_range(p4d_t *p4dp, unsigned long addr,
unsigned long end, bool free_mapped,
struct vmem_altmap *altmap)
{
@@ -994,11 +994,11 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
continue;
}
WARN_ON(!pud_table(pud));
- unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
+ unmap_pmd_range(pudp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
-static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+static void unmap_p4d_range(pgd_t *pgdp, unsigned long addr,
unsigned long end, bool free_mapped,
struct vmem_altmap *altmap)
{
@@ -1013,11 +1013,11 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
continue;
WARN_ON(!p4d_present(p4d));
- unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
+ unmap_pud_range(p4dp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
-static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+static void unmap_range(unsigned long addr, unsigned long end,
bool free_mapped, struct vmem_altmap *altmap)
{
unsigned long next;
@@ -1039,7 +1039,7 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
continue;
WARN_ON(!pgd_present(pgd));
- unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
+ unmap_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
@@ -1258,7 +1258,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
{
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
- unmap_hotplug_range(start, end, true, altmap);
+ unmap_range(start, end, true, altmap);
free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -1522,7 +1522,7 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
WARN_ON(pgdir != init_mm.pgd);
WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
- unmap_hotplug_range(start, end, false, NULL);
+ unmap_range(start, end, false, NULL);
free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}
--
2.35.3
From: Mike Rapoport <[email protected]>
The check for crashkernel command line parameter and presence of
CONFIG_ZONE_DMA[32] in mmu::map_mem() are not necessary because
crashk_res.end would be set by the time map_mem() runs only if
reserve_crashkernel() was called from arm64_memblock_init() and only if
there was proper crashkernel parameter in the command line.
Leave only check that crashk_res.end is non-zero to decide whether
crash kernel memory should be mapped with base pages.
Signed-off-by: Mike Rapoport <[email protected]>
---
arch/arm64/mm/mmu.c | 44 ++++++++++++--------------------------------
1 file changed, 12 insertions(+), 32 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 83f2f18f7f34..fa23cfa6b772 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -502,21 +502,6 @@ void __init mark_linear_text_alias_ro(void)
PAGE_KERNEL_RO);
}
-static bool crash_mem_map __initdata;
-
-static int __init enable_crash_mem_map(char *arg)
-{
- /*
- * Proper parameter parsing is done by reserve_crashkernel(). We only
- * need to know if the linear map has to avoid block mappings so that
- * the crashkernel reservations can be unmapped later.
- */
- crash_mem_map = true;
-
- return 0;
-}
-early_param("crashkernel", enable_crash_mem_map);
-
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
@@ -547,11 +532,9 @@ static void __init map_mem(pgd_t *pgdp)
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map && !have_zone_dma()) {
- if (crashk_res.end)
- memblock_mark_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
+ if (crashk_res.end)
+ memblock_mark_nomap(crashk_res.start,
+ resource_size(&crashk_res));
#endif
/* map all the memory banks */
@@ -582,20 +565,17 @@ static void __init map_mem(pgd_t *pgdp)
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
/*
- * Use page-level mappings here so that we can shrink the region
- * in page granularity and put back unused memory to buddy system
- * through /sys/kernel/kexec_crash_size interface.
+ * Use page-level mappings here so that we can protect crash kernel
+ * memory to allow post-mortem analysis when things go awry.
*/
#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map && !have_zone_dma()) {
- if (crashk_res.end) {
- __map_memblock(pgdp, crashk_res.start,
- crashk_res.end + 1,
- PAGE_KERNEL,
- NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
- memblock_clear_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
+ if (crashk_res.end) {
+ __map_memblock(pgdp, crashk_res.start,
+ crashk_res.end + 1,
+ PAGE_KERNEL,
+ NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+ memblock_clear_nomap(crashk_res.start,
+ resource_size(&crashk_res));
}
#endif
}
--
2.35.3
From: Mike Rapoport <[email protected]>
to minimize extra ifdefery when unmap_*() methods will be used to remap
crash kernel.
Signed-off-by: Mike Rapoport <[email protected]>
---
arch/arm64/mm/mmu.c | 50 ++++++++++++++++++++++-----------------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ea81e40a25cd..92267e5e9b5f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -887,30 +887,6 @@ static void free_hotplug_page_range(struct page *page, size_t size,
}
}
-static void free_hotplug_pgtable_page(struct page *page)
-{
- free_hotplug_page_range(page, PAGE_SIZE, NULL);
-}
-
-static bool pgtable_range_aligned(unsigned long start, unsigned long end,
- unsigned long floor, unsigned long ceiling,
- unsigned long mask)
-{
- start &= mask;
- if (start < floor)
- return false;
-
- if (ceiling) {
- ceiling &= mask;
- if (!ceiling)
- return false;
- }
-
- if (end - 1 > ceiling - 1)
- return false;
- return true;
-}
-
static void unmap_pte_range(pmd_t *pmdp, unsigned long addr,
unsigned long end, bool free_mapped,
struct vmem_altmap *altmap)
@@ -1043,6 +1019,30 @@ static void unmap_range(unsigned long addr, unsigned long end,
} while (addr = next, addr < end);
}
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling,
+ unsigned long mask)
+{
+ start &= mask;
+ if (start < floor)
+ return false;
+
+ if (ceiling) {
+ ceiling &= mask;
+ if (!ceiling)
+ return false;
+ }
+
+ if (end - 1 > ceiling - 1)
+ return false;
+ return true;
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+ free_hotplug_page_range(page, PAGE_SIZE, NULL);
+}
+
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned long floor,
unsigned long ceiling)
@@ -1196,7 +1196,7 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
}
-#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
#if !ARM64_KERNEL_USES_PMD_MAPS
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
--
2.35.3
From: Mike Rapoport <[email protected]>
For server systems it is important to protect crash kernel memory for
post-mortem analysis. In order to protect this memory it should be mapped
at PTE level.
When CONFIG_ZONE_DMA or CONFIG_ZONE_DMA32 is enabled, usage of crash kernel
essentially forces mapping of the entire linear map with base pages even if
rodata_full is not set (commit 2687275a5843 ("arm64: Force
NO_BLOCK_MAPPINGS if crashkernel reservation is required")) and this causes
performance degradation.
With ZONE_DMA/DMA32 enabled, the crash kernel memory is reserved after
the linear map is created, but before multiprocessing and multithreading
are enabled, so it is safe to remap the crash kernel memory with base
pages as long as the page table entries that would be changed do not map
the memory that might be accessed during the remapping.
To ensure there are no memory accesses in the range that will be
remapped, align crash memory reservation to PUD_SIZE boundaries, remap
the entire PUD-aligned area and then free the memory that was allocated
beyond the crash_size requested by the user.
Signed-off-by: Mike Rapoport <[email protected]>
---
arch/arm64/include/asm/mmu.h | 3 ++
arch/arm64/kernel/machine_kexec.c | 6 +++
arch/arm64/mm/init.c | 65 +++++++++++++++++++++++++------
arch/arm64/mm/mmu.c | 40 ++++++++++++++++---
4 files changed, 98 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466a4be9..aba3c095272e 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -71,6 +71,9 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
extern void mark_linear_text_alias_ro(void);
extern bool kaslr_requires_kpti(void);
+extern int remap_crashkernel(phys_addr_t start, phys_addr_t size,
+ phys_addr_t aligned_size);
+extern bool crashkres_protection_possible;
#define INIT_MM_CONTEXT(name) \
.pgd = init_pg_dir,
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 19c2d487cb08..68295403aa40 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -272,6 +272,9 @@ void arch_kexec_protect_crashkres(void)
{
int i;
+ if (!crashkres_protection_possible)
+ return;
+
for (i = 0; i < kexec_crash_image->nr_segments; i++)
set_memory_valid(
__phys_to_virt(kexec_crash_image->segment[i].mem),
@@ -282,6 +285,9 @@ void arch_kexec_unprotect_crashkres(void)
{
int i;
+ if (!crashkres_protection_possible)
+ return;
+
for (i = 0; i < kexec_crash_image->nr_segments; i++)
set_memory_valid(
__phys_to_virt(kexec_crash_image->segment[i].mem),
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index a6585d50a76c..d5d647aaf23b 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -40,6 +40,7 @@
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/sections.h>
+#include <asm/set_memory.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
@@ -70,19 +71,19 @@ EXPORT_SYMBOL(memstart_addr);
* crash kernel memory which has a dependency on arm64_dma_phys_limit.
* Reserving memory early for crash kernel allows linear creation of block
* mappings (greater than page-granularity) for all the memory bank rangs.
- * In this scheme a comparatively quicker boot is observed.
+ * In this scheme a comparatively quicker boot is observed and overall
+ * memory access via the linear map is more efficient as there is less TLB
+ * pressure.
*
* If ZONE_DMA configs are defined, crash kernel memory reservation
* is delayed until DMA zone memory range size initialization performed in
* zone_sizes_init(). The defer is necessary to steer clear of DMA zone
- * memory range to avoid overlap allocation. So crash kernel memory boundaries
- * are not known when mapping all bank memory ranges, which otherwise means
- * not possible to exclude crash kernel range from creating block mappings
- * so page-granularity mappings are created for the entire memory range.
- * Hence a slightly slower boot is observed.
- *
- * Note: Page-granularity mappings are necessary for crash kernel memory
- * range for shrinking its size via /sys/kernel/kexec_crash_size interface.
+ * memory range to avoid overlap allocation. To keep block mappings in the
+ * linear map, the first reservation attempt tries to allocate PUD-aligned
+ * region so that it would be possible to remap crash kernel memory with
+ * base pages. If there is not enough memory for such extended reservation,
+ * the exact amount of memory is reserved and crash kernel protection is
+ * disabled.
*/
#if IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32)
phys_addr_t __ro_after_init arm64_dma_phys_limit;
@@ -90,6 +91,8 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit;
phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
#endif
+bool __ro_after_init crashkres_protection_possible;
+
/* Current arm64 boot protocol requires 2MB alignment */
#define CRASH_ALIGN SZ_2M
@@ -116,6 +119,43 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
return 0;
}
+static unsigned long long __init
+reserve_remap_crashkernel(unsigned long long crash_base,
+ unsigned long long crash_size,
+ unsigned long long crash_max)
+{
+ unsigned long long size;
+
+ /*
+ * If linear map uses base pages or there is no ZONE_DMA/ZONE_DMA32
+ * the crashk_res will be mapped with PTEs in mmu::map_mem()
+ */
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE) ||
+ !have_zone_dma()) {
+ crashkres_protection_possible = true;
+ return 0;
+ }
+
+ if (crash_base)
+ return 0;
+
+ size = ALIGN(crash_size, PUD_SIZE);
+
+ crash_base = memblock_phys_alloc_range(size, PUD_SIZE, 0, crash_max);
+ if (!crash_base)
+ return 0;
+
+ if (remap_crashkernel(crash_base, crash_size, size)) {
+ memblock_phys_free(crash_base, size);
+ return 0;
+ }
+
+ crashkres_protection_possible = true;
+ memblock_phys_free(crash_base + crash_size, size - crash_size);
+
+ return crash_base;
+}
+
/*
* reserve_crashkernel() - reserves memory for crash kernel
*
@@ -162,8 +202,11 @@ static void __init reserve_crashkernel(void)
if (crash_base)
crash_max = crash_base + crash_size;
- crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
- crash_base, crash_max);
+ crash_base = reserve_remap_crashkernel(crash_base, crash_size,
+ crash_max);
+ if (!crash_base)
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ crash_base, crash_max);
if (!crash_base) {
pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
crash_size);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 92267e5e9b5f..83f2f18f7f34 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -547,10 +547,8 @@ static void __init map_mem(pgd_t *pgdp)
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map) {
- if (have_zone_dma())
- flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
- else if (crashk_res.end)
+ if (crash_mem_map && !have_zone_dma()) {
+ if (crashk_res.end)
memblock_mark_nomap(crashk_res.start,
resource_size(&crashk_res));
}
@@ -875,7 +873,7 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(pte));
}
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_KEXEC_CORE)
static void free_hotplug_page_range(struct page *page, size_t size,
struct vmem_altmap *altmap)
{
@@ -1018,7 +1016,9 @@ static void unmap_range(unsigned long addr, unsigned long end,
unmap_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
}
+#endif /* CONFIG_MEMORY_HOTPLUG || CONFIG_KEXEC_CORE */
+#ifdef CONFIG_MEMORY_HOTPLUG
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
unsigned long floor, unsigned long ceiling,
unsigned long mask)
@@ -1263,6 +1263,36 @@ void vmemmap_free(unsigned long start, unsigned long end,
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+int __init remap_crashkernel(phys_addr_t start, phys_addr_t size,
+ phys_addr_t aligned_size)
+{
+#ifdef CONFIG_KEXEC_CORE
+ phys_addr_t end = start + size;
+ phys_addr_t aligned_end = start + aligned_size;
+
+ if (!IS_ALIGNED(start, PUD_SIZE) || !IS_ALIGNED(aligned_end, PUD_SIZE))
+ return -EINVAL;
+
+ /* Clear PUDs containing crash kernel memory */
+ unmap_range(__phys_to_virt(start), __phys_to_virt(aligned_end),
+ false, NULL);
+
+ /* map crash kernel memory with base pages */
+ __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+ size, PAGE_KERNEL, early_pgtable_alloc,
+ NO_EXEC_MAPPINGS | NO_BLOCK_MAPPINGS |
+ NO_CONT_MAPPINGS);
+
+ /* map area from end of crash kernel to PUD end with large pages */
+ size = aligned_end - end;
+ if (size)
+ __create_pgd_mapping(swapper_pg_dir, end, __phys_to_virt(end),
+ size, PAGE_KERNEL, early_pgtable_alloc, 0);
+#endif
+
+ return 0;
+}
+
static inline pud_t *fixmap_pud(unsigned long addr)
{
pgd_t *pgdp = pgd_offset_k(addr);
--
2.35.3
From: Mike Rapoport <[email protected]>
The new name better describes what the function does and does not
restrict its use to crash kernel reservations.
Signed-off-by: Mike Rapoport <[email protected]>
---
arch/arm64/include/asm/memory.h | 2 +-
arch/arm64/mm/init.c | 4 ++--
arch/arm64/mm/mmu.c | 4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 9dd08cd339c3..27fce129b97e 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -364,7 +364,7 @@ static inline void *phys_to_virt(phys_addr_t x)
void dump_mem_limit(void);
-static inline bool defer_reserve_crashkernel(void)
+static inline bool have_zone_dma(void)
{
return IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32);
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index b9af30be813e..a6585d50a76c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -389,7 +389,7 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
- if (!defer_reserve_crashkernel())
+ if (!have_zone_dma())
reserve_crashkernel();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -438,7 +438,7 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- if (defer_reserve_crashkernel())
+ if (have_zone_dma())
reserve_crashkernel();
memblock_dump_all();
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index db7c4e6ae57b..bf303f1dea25 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -548,7 +548,7 @@ static void __init map_mem(pgd_t *pgdp)
#ifdef CONFIG_KEXEC_CORE
if (crash_mem_map) {
- if (defer_reserve_crashkernel())
+ if (have_zone_dma())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
else if (crashk_res.end)
memblock_mark_nomap(crashk_res.start,
@@ -589,7 +589,7 @@ static void __init map_mem(pgd_t *pgdp)
* through /sys/kernel/kexec_crash_size interface.
*/
#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map && !defer_reserve_crashkernel()) {
+ if (crash_mem_map && !have_zone_dma()) {
if (crashk_res.end) {
__map_memblock(pgdp, crashk_res.start,
crashk_res.end + 1,
--
2.35.3
Add kexec list in CC
On 08/19/22 at 07:11am, Mike Rapoport wrote:
> From: Mike Rapoport <[email protected]>
>
> Hi,
>
> There were several rounds of discussion how to remap with base pages only
> the crash kernel area, the latest one here:
>
> https://lore.kernel.org/all/[email protected]
>
> and this is my attempt to allow having both large pages in the linear map
> and protection for the crash kernel memory.
>
> For server systems it is important to protect crash kernel memory for
> post-mortem analysis, and for that protection to work the crash kernel
> memory should be mapped with base pages in the linear map.
>
> On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
> happens after the linear map is created and the current code forces using
> base pages for the entire linear map, which results in performance
> degradation.
>
> These patches enable remapping of the crash kernel area with base pages
> while keeping large pages in the rest of the linear map.
>
> The idea is to align crash kernel reservation to PUD boundaries, remap that
> PUD and then free the extra memory.
Hi Mike,
Thanks for the effort to work on this issue. While I have to say this
isn't good because it can only be made relying on a prerequisite that
there's big enough memory. If on a system with, say, 2G memory, it's not easy
to succeed in getting one 1G region. While we only require a far smaller
region than 1G, e.g. about 200M, which should be easy to get. So the way
taken in this patchset is too quirky and will cause regression on
systems with small memory. This kind of system with small memory exists
widely on virt guest instances.
The crashkernel reservation happens after linear map because the
reservation needs to know the dma zone boundary, arm64_dma_phys_limit.
If we can deduce that before bootmem_init(), the reservation can be
done before linear map. I will make an attempt on that. If still can't
be accepted, we would like to take off the crashkernel region protection
on arm64 for now.
Thanks
Baoquan
Hi Baoquan,
On Thu, Aug 25, 2022 at 03:35:04PM +0800, Baoquan He wrote:
> Add kexec list in CC
>
> On 08/19/22 at 07:11am, Mike Rapoport wrote:
> > From: Mike Rapoport <[email protected]>
> >
> > Hi,
> >
> > There were several rounds of discussion how to remap with base pages only
> > the crash kernel area, the latest one here:
> >
> > https://lore.kernel.org/all/[email protected]
> >
> > and this is my attempt to allow having both large pages in the linear map
> > and protection for the crash kernel memory.
> >
> > For server systems it is important to protect crash kernel memory for
> > post-mortem analysis, and for that protection to work the crash kernel
> > memory should be mapped with base pages in the linear map.
> >
> > On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
> > happens after the linear map is created and the current code forces using
> > base pages for the entire linear map, which results in performance
> > degradation.
> >
> > These patches enable remapping of the crash kernel area with base pages
> > while keeping large pages in the rest of the linear map.
> >
> > The idea is to align crash kernel reservation to PUD boundaries, remap that
> > PUD and then free the extra memory.
>
> Hi Mike,
>
> Thanks for the effort to work on this issue. While I have to say this
> isnt's good because it can only be made relying on a prerequisite that
> there's big enough memory. If on a system, say 2G memory, it's not easy
> to succeed on getting one 1G memory. While we only require far smaller
> region than 1G, e.g about 200M which should be easy to get. So the way
> taken in this patchset is too quirky and will cause regression on
> systemswith small memory. This kind of sytems with small memory exists
> widely on virt guest instance.
I don't agree there is a regression. If the PUD-aligned allocation fails,
there is a fallback to the allocation of the exact size requested for crash
kernel. This allocation just won't get protected.
Also please note, that the changes are only for the case when user didn't
force base-size pages in the linear map, so anything that works now will
work the same way with this set applied.
> The crashkernel reservation happens after linear map because the
> reservation needs to know the dma zone boundary, arm64_dma_phys_limit.
> If we can deduce that before bootmem_init(), the reservation can be
> done before linear map. I will make an attempt on that. If still can't
> be accepted, we would like to take off the crashkernel region protection
> on arm64 for now.
I doubt it would be easy because arm64_dma_phys_limit is determined after
parsing of the device tree and there might be memory allocations of
possibly unmapped memory during the parsing.
> Thanks
> Baoquan
>
--
Sincerely yours,
Mike.
On 08/25/22 at 10:48am, Mike Rapoport wrote:
......
> > > There were several rounds of discussion how to remap with base pages only
> > > the crash kernel area, the latest one here:
> > >
> > > https://lore.kernel.org/all/[email protected]
> > >
> > > and this is my attempt to allow having both large pages in the linear map
> > > and protection for the crash kernel memory.
> > >
> > > For server systems it is important to protect crash kernel memory for
> > > post-mortem analysis, and for that protection to work the crash kernel
> > > memory should be mapped with base pages in the linear map.
> > >
> > > On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
> > > happens after the linear map is created and the current code forces using
> > > base pages for the entire linear map, which results in performance
> > > degradation.
> > >
> > > These patches enable remapping of the crash kernel area with base pages
> > > while keeping large pages in the rest of the linear map.
> > >
> > > The idea is to align crash kernel reservation to PUD boundaries, remap that
> > > PUD and then free the extra memory.
> >
> > Hi Mike,
> >
> > Thanks for the effort to work on this issue. While I have to say this
> > isnt's good because it can only be made relying on a prerequisite that
> > there's big enough memory. If on a system, say 2G memory, it's not easy
> > to succeed on getting one 1G memory. While we only require far smaller
> > region than 1G, e.g about 200M which should be easy to get. So the way
> > taken in this patchset is too quirky and will cause regression on
> > systemswith small memory. This kind of sytems with small memory exists
> > widely on virt guest instance.
>
> I don't agree there is a regression. If the PUD-aligned allocation fails,
> there is a fallback to the allocation of the exact size requested for crash
> kernel. This allocation just won't get protected.
Sorry, I misunderstood it. I just went through the log and didn't
look into the code.
But honestly, if we accept the fallback which doesn't do the protection,
we should be able to take off the protection completely, right?
Otherwise, the reservation code is a little complicated.
>
> Also please note, that the changes are only for the case when user didn't
> force base-size pages in the linear map, so anything that works now will
> work the same way with this set applied.
>
> > The crashkernel reservation happens after linear map because the
> > reservation needs to know the dma zone boundary, arm64_dma_phys_limit.
> > If we can deduce that before bootmem_init(), the reservation can be
> > done before linear map. I will make an attempt on that. If still can't
> > be accepted, we would like to take off the crashkernel region protection
> > on arm64 for now.
>
> I doubt it would be easy because arm64_dma_phys_limit is determined after
> parsing of the device tree and there might be memory allocations of
> possibly unmapped memory during the parsing.
I have sent out the patches with an attempt, it's pretty straightforward
and simple. Because arm64 only has one exception, namely Raspberry Pi 4,
on which some peripherals can only address 30bit range. That is a corner
case, to be honest. And kdump is a necessary feature on server, but may
not be so expected on Raspberry Pi 4, a system for computer education
and hobbyists. And kdump only cares whether the dump target devices can
address 32bit range, namely storage device or network card on server.
If finally confirmed that storage devices can only address 30bit range
on Raspberry Pi 4, people still can have crashkernel=xM@yM method to
reserve crashkernel regions.
Thanks
Baoquan
On Sun, Aug 28, 2022 at 04:37:29PM +0800, Baoquan He wrote:
> On 08/25/22 at 10:48am, Mike Rapoport wrote:
> ......
> > > > There were several rounds of discussion how to remap with base pages only
> > > > the crash kernel area, the latest one here:
> > > >
> > > > https://lore.kernel.org/all/[email protected]
> > > >
> > > > and this is my attempt to allow having both large pages in the linear map
> > > > and protection for the crash kernel memory.
> > > >
> > > > For server systems it is important to protect crash kernel memory for
> > > > post-mortem analysis, and for that protection to work the crash kernel
> > > > memory should be mapped with base pages in the linear map.
> > > >
> > > > On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
> > > > happens after the linear map is created and the current code forces using
> > > > base pages for the entire linear map, which results in performance
> > > > degradation.
> > > >
> > > > These patches enable remapping of the crash kernel area with base pages
> > > > while keeping large pages in the rest of the linear map.
> > > >
> > > > The idea is to align crash kernel reservation to PUD boundaries, remap that
> > > > PUD and then free the extra memory.
> > >
> > > Hi Mike,
> > >
> > > Thanks for the effort to work on this issue. While I have to say this
> > > isnt's good because it can only be made relying on a prerequisite that
> > > there's big enough memory. If on a system, say 2G memory, it's not easy
> > > to succeed on getting one 1G memory. While we only require far smaller
> > > region than 1G, e.g about 200M which should be easy to get. So the way
> > > taken in this patchset is too quirky and will cause regression on
> > > systemswith small memory. This kind of sytems with small memory exists
> > > widely on virt guest instance.
> >
> > I don't agree there is a regression. If the PUD-aligned allocation fails,
> > there is a fallback to the allocation of the exact size requested for crash
> > kernel. This allocation just won't get protected.
>
> Sorry, I misunderstood it. I just went through the log and didn't
> look into codes.
>
> But honestly, if we accept the fallback which doesn't do the protection,
> we should be able to take off the protection completely, right?
> Otherwise, the reservation code is a little complicated.
We don't do protection of the crash kernel for most architectures
supporting kexec ;-)
My goal was to allow large systems with ZONE_DMA/DMA32 have block mappings
in the linear map and crash kernel protection without breaking backward
compatibility for the existing systems.
> > Also please note, that the changes are only for the case when user didn't
> > force base-size pages in the linear map, so anything that works now will
> > work the same way with this set applied.
> >
> > > The crashkernel reservation happens after linear map because the
> > > reservation needs to know the dma zone boundary, arm64_dma_phys_limit.
> > > If we can deduce that before bootmem_init(), the reservation can be
> > > done before linear map. I will make an attempt on that. If still can't
> > > be accepted, we would like to take off the crashkernel region protection
> > > on arm64 for now.
> >
> > I doubt it would be easy because arm64_dma_phys_limit is determined after
> > parsing of the device tree and there might be memory allocations of
> > possibly unmapped memory during the parsing.
>
> I have sent out the patches with an attempt, it's pretty straightforward
> and simple. Because arm64 only has one exception, namely Raspberry Pi 4,
> on which some peripherals can only address 30bit range. That is a corner
> case, to be honest. And kdump is a necessary feature on server, but may
> not be so expected on Raspberry Pi 4, a system for computer education
> and hobbyists. And kdump only cares whether the dump target devices can
> address 32bit range, namely storage device or network card on server.
> If finally confirmed that storage devices can only address 30bit range
> on Raspberry Pi 4, people still can have crashkernel=xM@yM method to
> reserve crashkernel regions.
I hope you are right and Raspberry Pi 4 is the only system that limits
DMA'able range to 30 bits. But with diversity of arm64 chips and boards I
won't be surprised that there are other variants with a similar problem.
> Thanks
> Baoquan
>
--
Sincerely yours,
Mike.
On 08/29/22 at 05:31pm, Mike Rapoport wrote:
> On Sun, Aug 28, 2022 at 04:37:29PM +0800, Baoquan He wrote:
> > On 08/25/22 at 10:48am, Mike Rapoport wrote:
> > ......
> > > > > There were several rounds of discussion how to remap with base pages only
> > > > > the crash kernel area, the latest one here:
> > > > >
> > > > > https://lore.kernel.org/all/[email protected]
> > > > >
> > > > > and this is my attempt to allow having both large pages in the linear map
> > > > > and protection for the crash kernel memory.
> > > > >
> > > > > For server systems it is important to protect crash kernel memory for
> > > > > post-mortem analysis, and for that protection to work the crash kernel
> > > > > memory should be mapped with base pages in the linear map.
> > > > >
> > > > > On the systems with ZONE_DMA/DMA32 enabled, crash kernel reservation
> > > > > happens after the linear map is created and the current code forces using
> > > > > base pages for the entire linear map, which results in performance
> > > > > degradation.
> > > > >
> > > > > These patches enable remapping of the crash kernel area with base pages
> > > > > while keeping large pages in the rest of the linear map.
> > > > >
> > > > > The idea is to align crash kernel reservation to PUD boundaries, remap that
> > > > > PUD and then free the extra memory.
> > > >
> > > > Hi Mike,
> > > >
> > > > Thanks for the effort to work on this issue. While I have to say this
> > > > isn't good because it can only be made relying on a prerequisite that
> > > > there's big enough memory. If on a system, say 2G memory, it's not easy
> > > > to succeed in getting one 1G memory region. While we only require a far
> > > > smaller region than 1G, e.g. about 200M, which should be easy to get. So
> > > > the way taken in this patchset is too quirky and will cause regression on
> > > > systems with small memory. This kind of systems with small memory exists
> > > > widely on virt guest instances.
> > >
> > > I don't agree there is a regression. If the PUD-aligned allocation fails,
> > > there is a fallback to the allocation of the exact size requested for crash
> > > kernel. This allocation just won't get protected.
> >
> > Sorry, I misunderstood it. I just went through the log and didn't
> > look into codes.
> >
> > But honestly, if we accept the fallback which doesn't do the protection,
> > we should be able to take off the protection completely, right?
> > Otherwise, the reservation code is a little complicated.
>
> We don't do protection of the crash kernel for most architectures
> supporting kexec ;-)
Yeah. The protection was introduced into x86 first by my former
colleague at Red Hat as an enhancement. Later people ported it to arm64.
We have signature verification mechanism to check if corruption on
loaded kdump kernel happened. In fact, panic is a small probability
event, and accidental corruption on kdump kernel data is a much smaller
probability event. The protection is an icing on the cake. But if it
brings mess, better take it away if no way to clean up the mess.
>
> My goal was to allow large systems with ZONE_DMA/DMA32 have block mappings
> in the linear map and crash kernel protection without breaking backward
> compatibility for the existing systems.
>
> > > Also please note, that the changes are only for the case when user didn't
> > > force base-size pages in the linear map, so anything that works now will
> > > work the same way with this set applied.
> > >
> > > > The crashkernel reservation happens after linear map because the
> > > > reservation needs to know the dma zone boundary, arm64_dma_phys_limit.
> > > > If we can deduce that before bootmem_init(), the reservation can be
> > > > done before linear map. I will make an attempt on that. If still can't
> > > > be accepted, we would like to take off the crashkernel region protection
> > > > on arm64 for now.
> > >
> > > I doubt it would be easy because arm64_dma_phys_limit is determined after
> > > parsing of the device tree and there might be memory allocations of
> > > possibly unmapped memory during the parsing.
> >
> > I have sent out the patches with an attempt, it's pretty straightforward
> > and simple. Because arm64 only has one exception, namely Raspberry Pi 4,
> > on which some peripherals can only address 30bit range. That is a corner
> > case, to be honest. And kdump is a necessary feature on server, but may
> > not be so expected on Raspberry Pi 4, a system for computer education
> > and hobbyists. And kdump only cares whether the dump target devices can
> > address 32bit range, namely storage device or network card on server.
> > If finally confirmed that storage devices can only address 30bit range
> > on Raspberry Pi 4, people still can have crashkernel=xM@yM method to
> > reserve crashkernel regions.
>
> I hope you are right and Raspberry Pi 4 is the only system that limits
> DMA'able range to 30 bits. But with diversity of arm64 chips and boards I
> won't be surprised that there are other variants with a similar problem.
We still need people to confirm if the storage disk or NIC on RPi4 is
able to address the 32-bit range. From Nicolas's patch log and cover letter,
he said not all devices on RPi4 are 30-bit addressable.
It's possible a new arm64 chip comes out with devices of 30-bit addressing,
even though those arm64 servers are usually deployed with devices of wider
than 32-bit DMA addressing ability. And I don't think users of the chip will
care about kdump. Kdump is relied on more in enterprise-level systems.
On x86, we ignore those ISA devices in kdump kernel at the beginning.
As you can see, the current kdump kernel has no available physical pages
in the DMA zone on x86. If people have an ISA device in an x86_64 system, and
want to set it as dump target, it doesn't work at all. We don't support
the corner case. If we want to cover everything, we can only limp with
patches all over us.