2020-04-13 15:11:10

by Nicholas Piggin

Subject: [PATCH v2 0/4] huge vmalloc mappings

We can get a significant win with larger mappings for some of the big
global hashes.

Since RFC, relevant architectures have added p?d_leaf accessors so no
real arch changes required, and I changed it not to allocate huge
mappings for modules and a bunch of other fixes.

Nicholas Piggin (4):
mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
mm: Move ioremap page table mapping function to mm/
mm: HUGE_VMAP arch query functions cleanup
mm/vmalloc: Hugepage vmalloc mappings

arch/arm64/mm/mmu.c | 8 +-
arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +-
arch/x86/mm/ioremap.c | 6 +-
include/linux/io.h | 3 -
include/linux/vmalloc.h | 15 +
lib/ioremap.c | 203 +----------
mm/vmalloc.c | 413 +++++++++++++++++++----
7 files changed, 380 insertions(+), 274 deletions(-)

--
2.23.0


2020-04-13 15:11:20

by Nicholas Piggin

Subject: [PATCH v2 2/4] mm: Move ioremap page table mapping function to mm/

ioremap_page_range is a generic function to create a kernel virtual
mapping, move it to mm/vmalloc.c and rename it vmap_range.

For clarity with this move, also:
- Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
- Rename vmap_page_range (which takes a page array) to vmap_pages_range.
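
For reference, a minimal caller of the new interface (an illustrative sketch,
not taken from this patch) would look something like:

	/* map [vaddr, vaddr + size) to phys_addr using small pages only */
	err = vmap_range(vaddr, vaddr + size, phys_addr, PAGE_KERNEL,
			 PAGE_SHIFT);
	if (err)
		return err;

Passing PMD_SHIFT or PUD_SHIFT as max_page_shift permits (but does not
require) larger leaf entries, which is how ioremap_page_range now uses it.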

Signed-off-by: Nicholas Piggin <[email protected]>
---
include/linux/vmalloc.h | 3 +
lib/ioremap.c | 182 +++---------------------------
mm/vmalloc.c | 239 ++++++++++++++++++++++++++++++++++++----
3 files changed, 239 insertions(+), 185 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0507a162ccd0..eb8a5080e472 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -173,6 +173,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
struct page **pages);
#ifdef CONFIG_MMU
+int vmap_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift);
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 3f0e18543de8..7e383bdc51ad 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -60,176 +60,26 @@ static inline int ioremap_pud_enabled(void) { return 0; }
static inline int ioremap_pmd_enabled(void) { return 0; }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel(pmd, addr);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pmd_enabled())
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc(&init_mm, pud, addr);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
- continue;
-
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_pud_enabled())
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc(&init_mm, p4d, addr);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
- continue;
-
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
-{
- if (!ioremap_p4d_enabled())
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc(&init_mm, pgd, addr);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
- continue;
-
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
+ unsigned int max_page_shift = PAGE_SHIFT;
+
+ /*
+ * Due to the max_page_shift parameter to vmap_range, platforms must
+ * enable all smaller sizes to take advantage of a given size,
+ * otherwise fall back to small pages.
+ */
+ if (ioremap_pmd_enabled()) {
+ max_page_shift = PMD_SHIFT;
+ if (ioremap_pud_enabled()) {
+ max_page_shift = PUD_SHIFT;
+ if (ioremap_p4d_enabled())
+ max_page_shift = P4D_SHIFT;
+ }
+ }

- return err;
+ return vmap_range(addr, end, phys_addr, prot, max_page_shift);
}

#ifdef CONFIG_GENERIC_IOREMAP
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1afec7def23f..b1bc2fcae4e0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -128,7 +128,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
} while (p4d++, addr = next, addr != end);
}

-static void vunmap_page_range(unsigned long addr, unsigned long end)
+static void vunmap_range(unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
@@ -143,7 +143,208 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
} while (pgd++, addr = next, addr != end);
}

-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel(pmd, addr);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return 0;
+
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc(&init_mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift))
+ continue;
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return 0;
+
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static inline int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc(&init_mm, p4d, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift))
+ continue;
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return 0;
+
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static inline int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc(&init_mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift))
+ continue;
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ return err;
+}
+
+int vmap_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int ret;
+
+ ret = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return ret;
+}
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pte_t *pte;
@@ -169,7 +370,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}

-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pmd_t *pmd;
@@ -180,13 +381,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}

-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pud_t *pud;
@@ -197,13 +398,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}

-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
p4d_t *p4d;
@@ -214,7 +415,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
@@ -226,7 +427,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
*
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
*/
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
@@ -239,7 +440,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -247,12 +448,12 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
return nr;
}

-static int vmap_page_range(unsigned long start, unsigned long end,
+static int vmap_pages_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
int ret;

- ret = vmap_page_range_noflush(start, end, prot, pages);
+ ret = vmap_pages_range_noflush(start, end, prot, pages);
flush_cache_vmap(start, end);
return ret;
}
@@ -1238,7 +1439,7 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
*/
static void unmap_vmap_area(struct vmap_area *va)
{
- vunmap_page_range(va->va_start, va->va_end);
+ vunmap_range(va->va_start, va->va_end);
}

/*
@@ -1699,7 +1900,7 @@ static void vb_free(const void *addr, unsigned long size)
rcu_read_unlock();
BUG_ON(!vb);

- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ vunmap_range((unsigned long)addr, (unsigned long)addr + size);

if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range((unsigned long)addr,
@@ -1854,7 +2055,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro

kasan_unpoison_vmalloc(mem, size);

- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, prot, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -2020,7 +2221,7 @@ void __init vmalloc_init(void)
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
pgprot_t prot, struct page **pages)
{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
+ return vmap_pages_range_noflush(addr, addr + size, prot, pages);
}

/**
@@ -2039,7 +2240,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
*/
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
- vunmap_page_range(addr, addr + size);
+ vunmap_range(addr, addr + size);
}
EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);

@@ -2056,7 +2257,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
unsigned long end = addr + size;

flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
+ vunmap_range(addr, end);
flush_tlb_kernel_range(addr, end);
}
EXPORT_SYMBOL_GPL(unmap_kernel_range);
@@ -2067,7 +2268,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
unsigned long end = addr + get_vm_area_size(area);
int err;

- err = vmap_page_range(addr, end, prot, pages);
+ err = vmap_pages_range(addr, end, prot, pages);

return err > 0 ? 0 : err;
}
--
2.23.0

2020-04-13 15:11:23

by Nicholas Piggin

Subject: [PATCH v2 3/4] mm: HUGE_VMAP arch query functions cleanup

This changes the awkward approach where architectures provide init
functions to determine which levels they can provide large mappings for,
to one where the arch is queried for each call.

This allows odd configurations to be supported (PUD but not PMD), and will
make it easier to constant-fold dead code away if the arch inlines
unsupported levels.

This also adds a prot argument to the arch query. This is unused
currently but could help with some architectures (some powerpc
implementations can't map uncacheable memory with large pages for
example).

The name is changed from ioremap to vmap, as it will be used more
generally in the next patch.
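
As an illustration of how the prot argument might eventually be used (a
hypothetical, powerpc-flavoured sketch; nothing in this series does this yet):

	bool arch_vmap_pmd_supported(pgprot_t prot)
	{
		/* e.g. only allow huge mappings for normal cacheable memory */
		if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
			return false;
		return radix_enabled();
	}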

Signed-off-by: Nicholas Piggin <[email protected]>
---
arch/arm64/mm/mmu.c | 8 ++--
arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +--
arch/x86/mm/ioremap.c | 6 +--
include/linux/io.h | 3 --
include/linux/vmalloc.h | 10 +++++
lib/ioremap.c | 51 ++----------------------
mm/vmalloc.c | 9 +++++
7 files changed, 33 insertions(+), 60 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a374e4f51a62..b8e381c46fa1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1244,12 +1244,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
return dt_virt;
}

-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
return 0;
}

-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
{
/*
* Only 4k granule supports level 1 block mappings.
@@ -1259,9 +1259,9 @@ int __init arch_ioremap_pud_supported(void)
!IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}

-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
- /* See arch_ioremap_pud_supported() */
+ /* See arch_vmap_pud_supported() */
return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 8f9edf07063a..5130e7912dd4 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1091,13 +1091,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
set_pte_at(mm, addr, ptep, pte);
}

-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
{
/* HPT does not cope with large pages in the vmalloc area */
return radix_enabled();
}

-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
return radix_enabled();
}
@@ -1191,7 +1191,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
return 1;
}

-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
return 0;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 18c637c0dc6f..bb4b75c344e4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,12 +481,12 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);

-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
return 0;
}

-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
{
#ifdef CONFIG_X86_64
return boot_cpu_has(X86_FEATURE_GBPAGES);
@@ -495,7 +495,7 @@ int __init arch_ioremap_pud_supported(void)
#endif
}

-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
return boot_cpu_has(X86_FEATURE_PSE);
}
diff --git a/include/linux/io.h b/include/linux/io.h
index 8394c56babc2..2832e051bc2e 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -33,9 +33,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
void __init ioremap_huge_init(void);
-int arch_ioremap_p4d_supported(void);
-int arch_ioremap_pud_supported(void);
-int arch_ioremap_pmd_supported(void);
#else
static inline void ioremap_huge_init(void) { }
#endif
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb8a5080e472..291313a7e663 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -84,6 +84,16 @@ struct vmap_area {
};
};

+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#else
+static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
+#endif
+
/*
* Highlevel APIs for driver use
*/
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 7e383bdc51ad..0a1ddf1a1286 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -14,10 +14,9 @@
#include <asm/cacheflush.h>
#include <asm/pgtable.h>

+static unsigned int __read_mostly max_page_shift = PAGE_SHIFT;
+
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
static int __read_mostly ioremap_huge_disabled;

static int __init set_nohugeiomap(char *str)
@@ -29,56 +28,14 @@ early_param("nohugeiomap", set_nohugeiomap);

void __init ioremap_huge_init(void)
{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
+ if (!ioremap_huge_disabled)
+ max_page_shift = P4D_SHIFT;
}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- unsigned int max_page_shift = PAGE_SHIFT;
-
- /*
- * Due to the max_page_shift parameter to vmap_range, platforms must
- * enable all smaller sizes to take advantage of a given size,
- * otherwise fall back to small pages.
- */
- if (ioremap_pmd_enabled()) {
- max_page_shift = PMD_SHIFT;
- if (ioremap_pud_enabled()) {
- max_page_shift = PUD_SHIFT;
- if (ioremap_p4d_enabled())
- max_page_shift = P4D_SHIFT;
- }
- }
-
return vmap_range(addr, end, phys_addr, prot, max_page_shift);
}

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b1bc2fcae4e0..c898d16ddd25 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -171,6 +171,9 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
if (max_page_shift < PMD_SHIFT)
return 0;

+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
if ((end - addr) != PMD_SIZE)
return 0;

@@ -219,6 +222,9 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr,
if (max_page_shift < PUD_SHIFT)
return 0;

+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
if ((end - addr) != PUD_SIZE)
return 0;

@@ -268,6 +274,9 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
if (max_page_shift < P4D_SHIFT)
return 0;

+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
if ((end - addr) != P4D_SIZE)
return 0;

--
2.23.0

2020-04-13 15:12:14

by Nicholas Piggin

Subject: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

For platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmap mappings,
have vmalloc attempt to allocate PMD-sized pages first, before falling back
to small pages. Allocations which use something other than PAGE_KERNEL
protections are not permitted to use huge pages yet, not all callers expect
this (e.g., module allocations vs strict module rwx).

This gives a 6x reduction in dTLB misses for a `git diff` (of linux), from
45600 to 6500 and a 2.2% reduction in cycles on a 2-node POWER9.

This can result in more internal fragmentation and memory overhead for a
given allocation. It can also cause greater NUMA unbalance on hashdist
allocations.

There may be other callers that expect small pages under vmalloc but use
PAGE_KERNEL, I'm not sure if it's feasible to catch them all. An
alternative would be a new function or flag which enables large mappings,
and use that in callers.
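
To illustrate the intended behaviour (a sketch assuming this series is
applied; SZ_64M is just shorthand from linux/sizes.h):

	/* PAGE_KERNEL: the allocation may now be backed by PMD-sized pages */
	htab = __vmalloc(SZ_64M, GFP_KERNEL, PAGE_KERNEL);

	/* anything else (e.g. executable mappings) stays on small pages */
	text = __vmalloc(SZ_64M, GFP_KERNEL, PAGE_KERNEL_EXEC);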

Signed-off-by: Nicholas Piggin <[email protected]>
---
include/linux/vmalloc.h | 2 +
mm/vmalloc.c | 135 +++++++++++++++++++++++++++++-----------
2 files changed, 102 insertions(+), 35 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 291313a7e663..853b82eac192 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
+#define VM_HUGE_PAGES 0x00000100 /* may use huge pages */

/*
* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -58,6 +59,7 @@ struct vm_struct {
unsigned long size;
unsigned long flags;
struct page **pages;
+ unsigned int page_order;
unsigned int nr_pages;
phys_addr_t phys_addr;
const void *caller;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c898d16ddd25..7b7e992c5ff1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -436,7 +436,7 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
*
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
*/
-static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
+static int vmap_small_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
@@ -457,13 +457,44 @@ static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
return nr;
}

+static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
+{
+ if (page_shift == PAGE_SIZE) {
+ return vmap_small_pages_range_noflush(start, end, prot, pages);
+ } else {
+ unsigned long addr = start;
+ unsigned int i, nr = (end - start) >> page_shift;
+
+ for (i = 0; i < nr; i++) {
+ int err;
+
+ err = vmap_range_noflush(addr,
+ addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+ }
+}
+
static int vmap_pages_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
{
int ret;

- ret = vmap_pages_range_noflush(start, end, prot, pages);
+ BUG_ON(page_shift < PAGE_SHIFT);
+
+ ret = vmap_pages_range_noflush(start, end, prot, pages, page_shift);
flush_cache_vmap(start, end);
+
return ret;
}

@@ -2064,7 +2095,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro

kasan_unpoison_vmalloc(mem, size);

- if (vmap_pages_range(addr, addr + size, prot, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, prot, pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -2230,7 +2261,7 @@ void __init vmalloc_init(void)
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
pgprot_t prot, struct page **pages)
{
- return vmap_pages_range_noflush(addr, addr + size, prot, pages);
+ return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
}

/**
@@ -2277,7 +2308,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
unsigned long end = addr + get_vm_area_size(area);
int err;

- err = vmap_pages_range(addr, end, prot, pages);
+ err = vmap_pages_range(addr, end, prot, pages, PAGE_SHIFT);

return err > 0 ? 0 : err;
}
@@ -2325,9 +2356,11 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (unlikely(!size))
return NULL;

- if (flags & VM_IOREMAP)
- align = 1ul << clamp_t(int, get_count_order_long(size),
- PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ if (flags & VM_IOREMAP) {
+ align = max(align,
+ 1ul << clamp_t(int, get_count_order_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER));
+ }

area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
@@ -2534,7 +2567,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];

BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, area->page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);

@@ -2672,26 +2705,29 @@ void *vmap(struct page **pages, unsigned int count,
EXPORT_SYMBOL(vmap);

static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
+ gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags,
+ int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
struct page **pages;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
+ unsigned int page_order = page_shift - PAGE_SHIFT;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ 0 : __GFP_HIGHMEM;

- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ nr_pages = size >> page_shift;
array_size = (nr_pages * sizeof(struct page *));

/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ PAGE_KERNEL, 0, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2704,14 +2740,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,

area->pages = pages;
area->nr_pages = nr_pages;
+ area->page_order = page_order;

for (i = 0; i < area->nr_pages; i++) {
struct page *page;

- if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
- else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ page = alloc_pages_node(node,
+ alloc_mask|highmem_mask, page_order);

if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -2725,8 +2760,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

- if (map_vm_area(area, prot, pages))
+ if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
goto fail;
+
return area->addr;

fail:
@@ -2760,22 +2796,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
- struct vm_struct *area;
+ struct vm_struct *area = NULL;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;

size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;

- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) &&
+ (vm_flags & VM_HUGE_PAGES)) {
+ unsigned long size_per_node;
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE)
+ shift = PMD_SHIFT;
+ }
+
+again:
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, align);
+
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;

- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;

/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2789,8 +2842,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;

fail:
- warn_alloc(gfp_mask, NULL,
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ goto again;
+ }
+
+ if (!area) {
+ /* Warn for area allocation, page allocations already warn */
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
+ }
return NULL;
}

@@ -2825,16 +2886,19 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
* Return: pointer to the allocated memory or %NULL on error
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+ gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags,
+ int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, prot, vm_flags, node, caller);
}

void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ unsigned long vm_flags = 0;
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
+ vm_flags |= VM_HUGE_PAGES;
+ return __vmalloc_node(size, 1, gfp_mask, prot, vm_flags, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -2842,7 +2906,7 @@ EXPORT_SYMBOL(__vmalloc);
static inline void *__vmalloc_node_flags(unsigned long size,
int node, gfp_t flags)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL, VM_HUGE_PAGES,
node, __builtin_return_address(0));
}

@@ -2850,7 +2914,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
void *caller)
{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL, VM_HUGE_PAGES,
+ node, caller);
}

/**
@@ -2925,7 +2990,7 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, VM_HUGE_PAGES,
node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -3014,7 +3079,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 0,
NUMA_NO_NODE, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
--
2.23.0

2020-04-14 13:45:35

by David Rientjes

Subject: Re: [PATCH v2 0/4] huge vmalloc mappings

On Mon, 13 Apr 2020, Nicholas Piggin wrote:

> We can get a significant win with larger mappings for some of the big
> global hashes.
>
> Since RFC, relevant architectures have added p?d_leaf accessors so no
> real arch changes required, and I changed it not to allocate huge
> mappings for modules and a bunch of other fixes.
>

Hi Nicholas,

Any performance numbers to share besides the git diff in the last patch in
the series? I'm wondering if anything from mmtests or lkp-tests makes
sense to try?

> Nicholas Piggin (4):
> mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
> mm: Move ioremap page table mapping function to mm/
> mm: HUGE_VMAP arch query functions cleanup
> mm/vmalloc: Hugepage vmalloc mappings
>
> arch/arm64/mm/mmu.c | 8 +-
> arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +-
> arch/x86/mm/ioremap.c | 6 +-
> include/linux/io.h | 3 -
> include/linux/vmalloc.h | 15 +
> lib/ioremap.c | 203 +----------
> mm/vmalloc.c | 413 +++++++++++++++++++----
> 7 files changed, 380 insertions(+), 274 deletions(-)
>
> --
> 2.23.0
>
>
>

2020-04-14 15:24:49

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Matthew Wilcox's message of April 13, 2020 11:41 pm:
> On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
>> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
>> + pgprot_t prot, struct page **pages,
>> + unsigned int page_shift)
>> +{
>> + if (page_shift == PAGE_SIZE) {
>
> ... I think you meant 'page_shift == PAGE_SHIFT'

Thanks, good catch. I obviously didn't test the fallback path (the
other path works for small pages, it just goes one at a time).

> Overall I like this series, although it's a bit biased towards CPUs
> which have page sizes which match PMD/PUD sizes. It doesn't offer the
> possibility of using 64kB page sizes on ARM, for example.

No, it's just an incremental step on existing huge vmap stuff in
tree, so such a thing would be out of scope.

> But it's a
> step in the right direction.
>

I don't know about moving kernel maps away from a generic Linux page
table format. I quite like moving to it and making it as generic as
possible.

On the other hand, I also would like to make some arch-specific
allowances for certain special cases that may not fit within the
standard page table format, but it might be a much more specific and
limited interface than the general vmalloc stuff.

Thanks,
Nick

2020-04-14 15:30:44

by Nicholas Piggin

Subject: Re: [PATCH v2 0/4] huge vmalloc mappings

Excerpts from David Rientjes's message of April 14, 2020 10:27 am:
> On Mon, 13 Apr 2020, Nicholas Piggin wrote:
>
>> We can get a significant win with larger mappings for some of the big
>> global hashes.
>>
>> Since RFC, relevant architectures have added p?d_leaf accessors so no
>> real arch changes required, and I changed it not to allocate huge
>> mappings for modules and a bunch of other fixes.
>>
>
> Hi Nicholas,
>
> Any performance numbers to share besides the git diff in the last patch in
> the series? I'm wondering if anything from mmtests or lkp-tests makes
> sense to try?

Hey, no, I don't have any other tests I've run. Some of the networking
hashes do make use of it as well though, and might see a few % in
the right kind of workload. Looking through the tree, there's probably a
bunch of other stuff where it could help a little bit, but I just don't
have anything specific.

Thanks,
Nick

2020-04-14 15:31:30

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Christoph Hellwig's message of April 14, 2020 5:23 pm:
> On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
>> For platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmap mappings,
>> have vmalloc attempt to allocate PMD-sized pages first, before falling back
>> to small pages. Allocations which use something other than PAGE_KERNEL
>> protections are not permitted to use huge pages yet, not all callers expect
>> this (e.g., module allocations vs strict module rwx).
>>
>> This gives a 6x reduction in dTLB misses for a `git diff` (of linux), from
>> 45600 to 6500 and a 2.2% reduction in cycles on a 2-node POWER9.
>>
>> This can result in more internal fragmentation and memory overhead for a
>> given allocation. It can also cause greater NUMA unbalance on hashdist
>> allocations.
>>
>> There may be other callers that expect small pages under vmalloc but use
>> PAGE_KERNEL, I'm not sure if it's feasible to catch them all. An
>> alternative would be a new function or flag which enables large mappings,
>> and use that in callers.
>
> Why do we even use vmalloc in this case rather than just doing a huge
> page allocation?

Which case? Usually the answer would be because you don't want to use
contiguous physical memory and/or you don't want to use the linear
mapping.

> What callers are you interested in?

The dentry and inode caches for this test, obviously.

Lots of other things could possibly benefit though, other system
hashes like networking, but lot of other vmalloc callers that might
benefit right away, some others could use some work to batch up
allocation sizes to benefit.

Thanks,
Nick

2020-04-14 15:33:07

by Christophe Leroy

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings



On 13/04/2020 at 15:41, Matthew Wilcox wrote:
> On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
>> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
>> + pgprot_t prot, struct page **pages,
>> + unsigned int page_shift)
>> +{
>> + if (page_shift == PAGE_SIZE) {
>
> ... I think you meant 'page_shift == PAGE_SHIFT'
>
> Overall I like this series, although it's a bit biased towards CPUs
> which have page sizes which match PMD/PUD sizes. It doesn't offer the
> possibility of using 64kB page sizes on ARM, for example. But it's a
> step in the right direction.
>

I was going to ask more or less the same question, I would have liked to
use 512kB hugepages on powerpc 8xx.

Even the 8M hugepages (still on the 8xx), can they be used as well,
taking into account that two PGD entries have to point to the same 8M page ?

I sent out a series which tends to make the management of 512k and 8M
pages closer to what Linux expects, in order to use them inside kernel,
for Linear mappings and Kasan mappings for the moment. See
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=164620
It would be nice if we could amplify it and use it for ioremaps and
vmallocs as well.

Christophe

2020-04-14 16:23:53

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Christoph Hellwig's message of April 14, 2020 11:02 pm:
> On Tue, Apr 14, 2020 at 10:13:44PM +1000, Nicholas Piggin wrote:
>> Which case? Usually the answer would be because you don't want to use
>> contiguous physical memory and/or you don't want to use the linear
>> mapping.
>
> But with huge pages you do by definition already use large contiguous
> areas. So you want allocations larger than "small" huge pages but not
> using gigantic pages using vmalloc?

Yes.

Thanks,
Nick

2020-04-14 19:17:24

by Matthew Wilcox

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
> + pgprot_t prot, struct page **pages,
> + unsigned int page_shift)
> +{
> + if (page_shift == PAGE_SIZE) {

... I think you meant 'page_shift == PAGE_SHIFT'

Overall I like this series, although it's a bit biased towards CPUs
which have page sizes which match PMD/PUD sizes. It doesn't offer the
possibility of using 64kB page sizes on ARM, for example. But it's a
step in the right direction.

2020-04-15 19:48:13

by Christoph Hellwig

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmap mappings,
> have vmalloc attempt to allocate PMD-sized pages first, before falling back
> to small pages. Allocations which use something other than PAGE_KERNEL
> protections are not permitted to use huge pages yet, not all callers expect
> this (e.g., module allocations vs strict module rwx).
>
> This gives a 6x reduction in dTLB misses for a `git diff` (of linux), from
> 45600 to 6500 and a 2.2% reduction in cycles on a 2-node POWER9.
>
> This can result in more internal fragmentation and memory overhead for a
> given allocation. It can also cause greater NUMA unbalance on hashdist
> allocations.
>
> There may be other callers that expect small pages under vmalloc but use
> PAGE_KERNEL, I'm not sure if it's feasible to catch them all. An
> alternative would be a new function or flag which enables large mappings,
> and use that in callers.

Why do we even use vmalloc in this case rather than just doing a huge
page allocation? What callers are you interested in?

2020-04-15 21:30:54

by Christoph Hellwig

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

On Tue, Apr 14, 2020 at 10:13:44PM +1000, Nicholas Piggin wrote:
> Which case? Usually the answer would be because you don't want to use
> contiguous physical memory and/or you don't want to use the linear
> mapping.

But with huge pages you do by definition already use large contiguous
areas. So you want allocations larger than "small" huge pages but not
using gigantic pages using vmalloc?

2020-04-15 21:39:38

by Matthew Wilcox

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

On Tue, Apr 14, 2020 at 02:28:35PM +0200, Christophe Leroy wrote:
> On 13/04/2020 at 15:41, Matthew Wilcox wrote:
> > On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
> > > +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
> > > + pgprot_t prot, struct page **pages,
> > > + unsigned int page_shift)
> > > +{
> > > + if (page_shift == PAGE_SIZE) {
> >
> > ... I think you meant 'page_shift == PAGE_SHIFT'
> >
> > Overall I like this series, although it's a bit biased towards CPUs
> > which have page sizes which match PMD/PUD sizes. It doesn't offer the
> > possibility of using 64kB page sizes on ARM, for example. But it's a
> > step in the right direction.
>
> I was going to ask more or less the same question, I would have liked to use
> 512kB hugepages on powerpc 8xx.
>
> Even the 8M hugepages (still on the 8xx), can they be used as well, taking
> into account that two PGD entries have to point to the same 8M page ?
>
> I sent out a series which tends to make the management of 512k and 8M pages
> closer to what Linux expects, in order to use them inside kernel, for Linear
> mappings and Kasan mappings for the moment. See
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=164620
> It would be nice if we could amplify it and use it for ioremaps and vmallocs
> as well.

I haven't been looking at vmalloc at all; I've been looking at the page
cache. See:
https://lore.kernel.org/linux-mm/[email protected]/

Once we have large pages in the page cache, I want to sort out the API
for asking the CPU to insert a TLB entry. Right now, we use set_pte_at(),
set_pmd_at() and set_pud_at(). I'm thinking something along the lines of:

vm_fault_t vmf_set_page_at(struct vm_fault *vmf, struct page *page);

and the architecture can insert whatever PTEs and/or TLB entries it
likes based on compound_order(page) -- if, say, it's a 1MB page, it might
choose to insert 2 * 512kB entries, or just the upper or lower 512kB entry
(depending on which half of the 1MB page the address sits in).

2020-04-15 22:57:57

by Will Deacon

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Hi Nick,

On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmap mappings,
> have vmalloc attempt to allocate PMD-sized pages first, before falling back
> to small pages. Allocations which use something other than PAGE_KERNEL
> protections are not permitted to use huge pages yet, not all callers expect
> this (e.g., module allocations vs strict module rwx).
>
> This gives a 6x reduction in dTLB misses for a `git diff` (of linux), from
> 45600 to 6500 and a 2.2% reduction in cycles on a 2-node POWER9.

I wonder if it's worth extending vmap() to handle higher order pages in
a similar way? That might be helpful for tracing PMUs such as Arm SPE,
where the CPU streams tracing data out to a virtually addressed buffer
(see rb_alloc_aux_page()).

> This can result in more internal fragmentation and memory overhead for a
> given allocation. It can also cause greater NUMA unbalance on hashdist
> allocations.
>
> There may be other callers that expect small pages under vmalloc but use
> PAGE_KERNEL, I'm not sure if it's feasible to catch them all. An
> alternative would be a new function or flag which enables large mappings,
> and use that in callers.
>
> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> include/linux/vmalloc.h | 2 +
> mm/vmalloc.c | 135 +++++++++++++++++++++++++++++-----------
> 2 files changed, 102 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 291313a7e663..853b82eac192 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */
> #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
> #define VM_NO_GUARD 0x00000040 /* don't add guard page */
> #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
> +#define VM_HUGE_PAGES 0x00000100 /* may use huge pages */

Please can you add a check for this in the arm64 change_memory_common()
code? Other architectures might need something similar, but we need to
forbid changing memory attributes for portions of the huge page.

In general, I'm a bit wary of software table walkers tripping over this.
For example, I don't think apply_to_existing_page_range() can handle
huge mappings at all, but the one user (KASAN) only ever uses page mappings
so it's ok there.

> @@ -2325,9 +2356,11 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (unlikely(!size))
> return NULL;
>
> - if (flags & VM_IOREMAP)
> - align = 1ul << clamp_t(int, get_count_order_long(size),
> - PAGE_SHIFT, IOREMAP_MAX_ORDER);
> + if (flags & VM_IOREMAP) {
> + align = max(align,
> + 1ul << clamp_t(int, get_count_order_long(size),
> + PAGE_SHIFT, IOREMAP_MAX_ORDER));
> + }


I don't follow this part. Please could you explain why you're potentially
aligning above IOREMAP_MAX_ORDER? It doesn't seem to follow from the rest
of the patch.

Cheers,

Will

2020-04-16 02:40:23

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Will Deacon's message of April 15, 2020 8:47 pm:
> Hi Nick,
>
> On Mon, Apr 13, 2020 at 10:53:03PM +1000, Nicholas Piggin wrote:
>> For platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmap mappings,
>> have vmalloc attempt to allocate PMD-sized pages first, before falling back
>> to small pages. Allocations which use something other than PAGE_KERNEL
>> protections are not permitted to use huge pages yet, not all callers expect
>> this (e.g., module allocations vs strict module rwx).
>>
>> This gives a 6x reduction in dTLB misses for a `git diff` (of linux), from
>> 45600 to 6500 and a 2.2% reduction in cycles on a 2-node POWER9.
>
> I wonder if it's worth extending vmap() to handle higher order pages in
> a similar way? That might be helpful for tracing PMUs such as Arm SPE,
> where the CPU streams tracing data out to a virtually addressed buffer
> (see rb_alloc_aux_page()).

Yeah, it becomes pretty trivial to do that with VM_HUGE_PAGES after
this patch. I have something to do it but no callers ready yet; if
you have an easy one we can add it.

>> This can result in more internal fragmentation and memory overhead for a
>> given allocation. It can also cause greater NUMA unbalance on hashdist
>> allocations.
>>
>> There may be other callers that expect small pages under vmalloc but use
>> PAGE_KERNEL, I'm not sure if it's feasible to catch them all. An
>> alternative would be a new function or flag which enables large mappings,
>> and use that in callers.
>>
>> Signed-off-by: Nicholas Piggin <[email protected]>
>> ---
>> include/linux/vmalloc.h | 2 +
>> mm/vmalloc.c | 135 +++++++++++++++++++++++++++++-----------
>> 2 files changed, 102 insertions(+), 35 deletions(-)
>>
>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 291313a7e663..853b82eac192 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */
>> #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
>> #define VM_NO_GUARD 0x00000040 /* don't add guard page */
>> #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
>> +#define VM_HUGE_PAGES 0x00000100 /* may use huge pages */
>
> Please can you add a check for this in the arm64 change_memory_common()
> code? Other architectures might need something similar, but we need to
> forbid changing memory attributes for portions of the huge page.

Yeah good idea, I can look about adding some more checks.

>
> In general, I'm a bit wary of software table walkers tripping over this.
> For example, I don't think apply_to_existing_page_range() can handle
> huge mappings at all, but the one user (KASAN) only ever uses page mappings
> so it's ok there.

Right, I have something to add a warning for apply_to_page_range() (and
am looking at adding support for bigger pages). It doesn't even have a
test-and-warn at the moment, which isn't good practice IMO, so we should
add one even without huge vmap.

>
>> @@ -2325,9 +2356,11 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>> if (unlikely(!size))
>> return NULL;
>>
>> - if (flags & VM_IOREMAP)
>> - align = 1ul << clamp_t(int, get_count_order_long(size),
>> - PAGE_SHIFT, IOREMAP_MAX_ORDER);
>> + if (flags & VM_IOREMAP) {
>> + align = max(align,
>> + 1ul << clamp_t(int, get_count_order_long(size),
>> + PAGE_SHIFT, IOREMAP_MAX_ORDER));
>> + }
>
>
> I don't follow this part. Please could you explain why you're potentially
> aligning above IOREMAP_MAX_ORDER? It doesn't seem to follow from the rest
> of the patch.

Trying to remember. If the caller asks for a particular alignment we
shouldn't reduce it. Should put it in another patch.

Thanks,
Nick

2020-07-01 07:11:23

by Zefan Li

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

> static void *__vmalloc_node(unsigned long size, unsigned long align,
> - gfp_t gfp_mask, pgprot_t prot,
> - int node, const void *caller);
> + gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags,
> + int node, const void *caller);
> static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
> - pgprot_t prot, int node)
> + pgprot_t prot, unsigned int page_shift,
> + int node)
> {
> struct page **pages;
> + unsigned long addr = (unsigned long)area->addr;
> + unsigned long size = get_vm_area_size(area);
> + unsigned int page_order = page_shift - PAGE_SHIFT;
> unsigned int nr_pages, array_size, i;
> const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
> const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
> const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> - 0 :
> - __GFP_HIGHMEM;
> + 0 : __GFP_HIGHMEM;
>
> - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> + nr_pages = size >> page_shift;

While trying out this patchset, we encountered a BUG_ON in account_kernel_stack()
in kernel/fork.c.

BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

which obviously should be updated accordingly.

> array_size = (nr_pages * sizeof(struct page *));
>
> /* Please note that the recursion is strictly bounded. */
> if (array_size > PAGE_SIZE) {
> pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
> - PAGE_KERNEL, node, area->caller);
> + PAGE_KERNEL, 0, node, area->caller);
> } else {
> pages = kmalloc_node(array_size, nested_gfp, node);
> }

2020-07-03 00:16:26

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Zefan Li's message of July 1, 2020 5:10 pm:
>> static void *__vmalloc_node(unsigned long size, unsigned long align,
>> - gfp_t gfp_mask, pgprot_t prot,
>> - int node, const void *caller);
>> + gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags,
>> + int node, const void *caller);
>> static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>> - pgprot_t prot, int node)
>> + pgprot_t prot, unsigned int page_shift,
>> + int node)
>> {
>> struct page **pages;
>> + unsigned long addr = (unsigned long)area->addr;
>> + unsigned long size = get_vm_area_size(area);
>> + unsigned int page_order = page_shift - PAGE_SHIFT;
>> unsigned int nr_pages, array_size, i;
>> const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>> const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>> const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
>> - 0 :
>> - __GFP_HIGHMEM;
>> + 0 : __GFP_HIGHMEM;
>>
>> - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
>> + nr_pages = size >> page_shift;
>
> While trying out this patchset, we encountered a BUG_ON in account_kernel_stack()
> in kernel/fork.c.
>
> BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
>
> which obviously should be updated accordingly.

Thanks for finding that. We may have to change this around a bit so
nr_pages still appears to be in PAGE_SIZE units for anybody looking.
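
For illustration, one way to do that (just a sketch, not a tested fix) would
be to keep nr_pages in PAGE_SIZE units and step by the order when allocating,
filling in the constituent small pages:

	area->nr_pages = size >> PAGE_SHIFT;	/* PAGE_SIZE units as before */
	area->page_order = page_order;

	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
		struct page *page = alloc_pages_node(node,
				alloc_mask|highmem_mask, page_order);
		unsigned int j;

		if (unlikely(!page))
			goto fail;
		for (j = 0; j < (1U << page_order); j++)
			area->pages[i + j] = page + j;
	}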

Thanks,
Nick

2020-07-20 02:02:59

by Zefan Li

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
> + pgprot_t prot, struct page **pages,
> + unsigned int page_shift)
> +{
> + if (page_shift == PAGE_SIZE) {

Is this a typo of PAGE_SHIFT?

> + return vmap_small_pages_range_noflush(start, end, prot, pages);
> + } else {
> + unsigned long addr = start;
> + unsigned int i, nr = (end - start) >> page_shift;
> +
> + for (i = 0; i < nr; i++) {
> + int err;
> +
> + err = vmap_range_noflush(addr,
> + addr + (1UL << page_shift),
> + __pa(page_address(pages[i])), prot,
> + page_shift);
> + if (err)
> + return err;
> +
> + addr += 1UL << page_shift;
> + }
> +
> + return 0;
> + }
> +}
> +

2020-07-20 02:49:40

by Nicholas Piggin

Subject: Re: [PATCH v2 4/4] mm/vmalloc: Hugepage vmalloc mappings

Excerpts from Zefan Li's message of July 20, 2020 12:02 pm:
>> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
>> + pgprot_t prot, struct page **pages,
>> + unsigned int page_shift)
>> +{
>> + if (page_shift == PAGE_SIZE) {
>
> Is this a typo of PAGE_SHIFT?

Oh good catch, yeah that'll always be going via the one-at-a-time route
and slow down the small page vmaps. Will fix.
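
The check should just read, as you and Matthew pointed out:

	if (page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(start, end, prot, pages);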

Thanks,
Nick

>
>> + return vmap_small_pages_range_noflush(start, end, prot, pages);
>> + } else {
>> + unsigned long addr = start;
>> + unsigned int i, nr = (end - start) >> page_shift;
>> +
>> + for (i = 0; i < nr; i++) {
>> + int err;
>> +
>> + err = vmap_range_noflush(addr,
>> + addr + (1UL << page_shift),
>> + __pa(page_address(pages[i])), prot,
>> + page_shift);
>> + if (err)
>> + return err;
>> +
>> + addr += 1UL << page_shift;
>> + }
>> +
>> + return 0;
>> + }
>> +}
>> +
>