2022-06-20 11:32:29

by Muchun Song

Subject: [PATCH v5 0/2] make hugetlb_optimize_vmemmap compatible with memmap_on_memory

This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory
and is based on mm-stable. For the rationale, see the commit log of patch 2.

v5:
- Replace the enum with defines per David.
- Walk vmemmap page tables to avoid false positives.

v4:
- Fix compile error reported by the kernel test robot when CONFIG_MEMORY_HOTPLUG is disabled.
- Fix a bug when memory_block_size_bytes() is not equal to the section size.

v3:
- Switch away from the complicated enumeration magic (David).
- Introduce PageVmemmapSelfHosted to make both parameters compatible (David and Oscar).

v2:
- Fix compile error reported by the kernel test robot when !CONFIG_ZONE_DEVICE.

Muchun Song (2):
mm: memory_hotplug: enumerate all supported section flags
mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with
memmap_on_memory

Documentation/admin-guide/kernel-parameters.txt | 22 ++++-----
Documentation/admin-guide/sysctl/vm.rst | 5 +-
include/linux/memory_hotplug.h | 9 ----
include/linux/mmzone.h | 41 +++++++++++----
include/linux/page-flags.h | 11 +++++
mm/hugetlb_vmemmap.c | 66 ++++++++++++++++++++++---
mm/memory_hotplug.c | 33 +++++++------
mm/sparse.c | 2 +-
8 files changed, 132 insertions(+), 57 deletions(-)

--
2.11.0


2022-06-20 11:47:13

by Muchun Song

Subject: [PATCH v5 1/2] mm: memory_hotplug: enumerate all supported section flags

We are almost running out of section flags; only one bit is available in
the worst case (powerpc with 256k pages). However, there are still some
free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has
10 bits free beyond the current flags, arm64 has 8 bits free with the
worst case of 64K pages). The bit positions are currently hard coded,
which makes it inconvenient to use the extra bits on architectures other
than powerpc. So convert the section flags to an enumeration to make it
easy to add new section flags in the future. Also, move
SECTION_TAINT_ZONE_DEVICE into the scope of CONFIG_ZONE_DEVICE to save a
bit in the non-zone-device case.
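
For illustration, a minimal sketch (not part of this series; SECTION_FOO_BIT
and CONFIG_FOO are hypothetical names) of how the new scheme lets a
config-dependent flag slot in without renumbering anything by hand: the flag
only consumes a bit when its option is enabled, and SECTION_MAP_LAST_BIT /
SECTION_NID_SHIFT follow along automatically.

enum {
        SECTION_MARKED_PRESENT_BIT,
        SECTION_HAS_MEM_MAP_BIT,
        SECTION_IS_ONLINE_BIT,
        SECTION_IS_EARLY_BIT,
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
#ifdef CONFIG_FOO
        SECTION_FOO_BIT,                /* hypothetical new flag */
#endif
        SECTION_MAP_LAST_BIT,
};

#ifdef CONFIG_FOO
#define SECTION_FOO             BIT(SECTION_FOO_BIT)
#endif
/* The node id field starts right above whatever flags are compiled in. */
#define SECTION_MAP_MASK        (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT       SECTION_MAP_LAST_BIT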

Signed-off-by: Muchun Song <[email protected]>
---
include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++---------
mm/memory_hotplug.c | 6 ++++++
mm/sparse.c | 2 +-
3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..2b5757752333 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void);
* (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
* worst combination is powerpc with 256k pages,
* which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available.
+ * To sum it up, at least 6 bits are available on all architectures.
+ * However, we can exceed 6 bits on some other architectures except
+ * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
+ * with the worst case of 64K pages on arm64) if we make sure the
+ * exceeded bit is not applicable to powerpc.
*/
-#define SECTION_MARKED_PRESENT (1UL<<0)
-#define SECTION_HAS_MEM_MAP (1UL<<1)
-#define SECTION_IS_ONLINE (1UL<<2)
-#define SECTION_IS_EARLY (1UL<<3)
-#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
-#define SECTION_MAP_LAST_BIT (1UL<<5)
-#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT 6
+enum {
+ SECTION_MARKED_PRESENT_BIT,
+ SECTION_HAS_MEM_MAP_BIT,
+ SECTION_IS_ONLINE_BIT,
+ SECTION_IS_EARLY_BIT,
+#ifdef CONFIG_ZONE_DEVICE
+ SECTION_TAINT_ZONE_DEVICE_BIT,
+#endif
+ SECTION_MAP_LAST_BIT,
+};
+
+#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT)
+#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT)
+#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT)
+#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT)
+#ifdef CONFIG_ZONE_DEVICE
+#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
+#endif
+#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
+#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT

static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
@@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section)
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}

+#ifdef CONFIG_ZONE_DEVICE
static inline int online_device_section(struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

return section && ((section->section_mem_map & flags) == flags);
}
+#else
+static inline int online_device_section(struct mem_section *section)
+{
+ return 0;
+}
+#endif

static inline int online_section_nr(unsigned long nr)
{
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f1a730c4499..6662b86e9e64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -670,12 +670,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon

}

+#ifdef CONFIG_ZONE_DEVICE
static void section_taint_zone_device(unsigned long pfn)
{
struct mem_section *ms = __pfn_to_section(pfn);

ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif

/*
* Associate the pfn range with the given zone, initializing the memmaps
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..e5a8a3a0edd7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
{
unsigned long coded_mem_map =
(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
- BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
return coded_mem_map;
}
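
As a side note, a minimal sketch (not from the patch; sketch_encode_early_nid()
is a hypothetical name modeled on the early nid encoding in mm/sparse.c) of why
the BUILD_BUG_ON now compares bit counts directly: SECTION_MAP_LAST_BIT is a
bit index rather than a mask, all flag bits must fit within the
PFN_SECTION_SHIFT low bits of ->section_mem_map, and the node id stored there
during early sparse init starts right above the flags.

/*
 * Sketch only. SECTION_NID_SHIFT now equals the number of flag bits
 * actually compiled in (4 without CONFIG_ZONE_DEVICE, 5 with it)
 * instead of the previous hard-coded 6.
 */
static inline unsigned long sketch_encode_early_nid(int nid)
{
        return (unsigned long)nid << SECTION_NID_SHIFT;
}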
--
2.11.0

2022-06-21 08:38:06

by David Hildenbrand

Subject: Re: [PATCH v5 1/2] mm: memory_hotplug: enumerate all supported section flags

On 20.06.22 13:06, Muchun Song wrote:
> We are almost running out of section flags, only one bit is available in
> the worst case (powerpc with 256k pages). However, there are still some
> free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has
> 10 bits available, arm64 has 8 bits available with worst case of 64K
> pages). We have hard coded those numbers in code, it is inconvenient to
> use those bits on other architectures except powerpc. So transfer those
> section flags to enumeration to make it easy to add new section flags in
> the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
> CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.
>
> Signed-off-by: Muchun Song <[email protected]>

Reviewed-by: David Hildenbrand <[email protected]>

--
Thanks,

David / dhildenb

2022-06-21 21:32:05

by Andrew Morton

Subject: Re: [PATCH v5 0/2] make hugetlb_optimize_vmemmap compatible with memmap_on_memory

On Mon, 20 Jun 2022 19:06:14 +0800 Muchun Song <[email protected]> wrote:

> This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory
> and is based on mm-stable. The reason refers to the patch 2's commit log.
>
> v5:
> - Replace enum to defines per David.
> - Walk vmemmap page tables to avoid false-positive.

I can't see this second change in the v3->v5 deltas?



From: Muchun Song <[email protected]>
Subject: mm-memory_hotplug-enumerate-all-supported-section-flags-v5
Date: Mon, 20 Jun 2022 19:06:15 +0800

replace enum with defines per David

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Muchun Song <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

include/linux/mmzone.h | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)

--- a/include/linux/mmzone.h~mm-memory_hotplug-enumerate-all-supported-section-flags-v5
+++ a/include/linux/mmzone.h
@@ -1439,16 +1439,13 @@ enum {
SECTION_MAP_LAST_BIT,
};

-enum {
- SECTION_MARKED_PRESENT = BIT(SECTION_MARKED_PRESENT_BIT),
- SECTION_HAS_MEM_MAP = BIT(SECTION_HAS_MEM_MAP_BIT),
- SECTION_IS_ONLINE = BIT(SECTION_IS_ONLINE_BIT),
- SECTION_IS_EARLY = BIT(SECTION_IS_EARLY_BIT),
+#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT)
+#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT)
+#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT)
+#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT)
#ifdef CONFIG_ZONE_DEVICE
- SECTION_TAINT_ZONE_DEVICE = BIT(SECTION_TAINT_ZONE_DEVICE_BIT),
+#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
-};
-
#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT

_




From: Muchun Song <[email protected]>
Subject: mm-memory_hotplug-make-hugetlb_optimize_vmemmap-compatible-with-memmap_on_memory-v5
Date: Mon, 20 Jun 2022 19:06:16 +0800

walk vmemmap page tables to avoid false-positive

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Muchun Song <[email protected]>
Co-developed-by: Oscar Salvador <[email protected]>
Signed-off-by: Oscar Salvador <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

mm/hugetlb_vmemmap.c | 69 ++++++++++++++++++++++++++---------------
1 file changed, 44 insertions(+), 25 deletions(-)

--- a/mm/hugetlb_vmemmap.c~mm-memory_hotplug-make-hugetlb_optimize_vmemmap-compatible-with-memmap_on_memory-v5
+++ a/mm/hugetlb_vmemmap.c
@@ -10,6 +10,7 @@
*/
#define pr_fmt(fmt) "HugeTLB: " fmt

+#include <linux/memory.h>
#include "hugetlb_vmemmap.h"

/*
@@ -99,34 +100,52 @@ int hugetlb_vmemmap_alloc(struct hstate
static unsigned int vmemmap_optimizable_pages(struct hstate *h,
struct page *head)
{
- struct mem_section *ms;
- struct page *vmemmap_page;
- unsigned long pfn = page_to_pfn(head);
-
if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
return 0;

- ms = __pfn_to_section(pfn);
- vmemmap_page = sparse_decode_mem_map(ms->section_mem_map,
- pfn_to_section_nr(pfn));
- /*
- * Only the vmemmap pages' vmemmap may be marked as VmemmapSelfHosted.
- *
- * Due to HugeTLB alignment requirements, and the vmemmap pages being
- * at the start of the hotplugged memory region. Checking any vmemmap
- * page's vmemmap is fine.
- *
- * [ hotplugged memory ]
- * [ vmemmap ][ usable memory ]
- * ^ | | |
- * +---+ | |
- * ^ | |
- * +--------+ |
- * ^ |
- * +-----------------+
- */
- if (PageVmemmapSelfHosted(vmemmap_page))
- return 0;
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+ pmd_t *pmdp, pmd;
+ struct page *vmemmap_page;
+ unsigned long vaddr = (unsigned long)head;
+
+ /*
+ * Only the vmemmap page's vmemmap page can be self-hosted.
+ * Walking the page tables to find the backing page of the
+ * vmemmap page.
+ */
+ pmdp = pmd_off_k(vaddr);
+ /*
+ * The READ_ONCE() is used to stabilize *pmdp in a register or
+ * on the stack so that it will stop changing under the code.
+ * The only concurrent operation where it can be changed is
+ * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+ * operation).
+ */
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmd))
+ vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+ else
+ vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+ /*
+ * Due to HugeTLB alignment requirements and the vmemmap pages
+ * being at the start of the hotplugged memory region in
+ * memory_hotplug.memmap_on_memory case. Checking any vmemmap
+ * page's vmemmap page if it is marked as VmemmapSelfHosted is
+ * sufficient.
+ *
+ * [ hotplugged memory ]
+ * [ section ][...][ section ]
+ * [ vmemmap ][ usable memory ]
+ * ^ | | |
+ * +---+ | |
+ * ^ | |
+ * +-------+ |
+ * ^ |
+ * +-------------------------------------------+
+ */
+ if (PageVmemmapSelfHosted(vmemmap_page))
+ return 0;
+ }

return hugetlb_optimize_vmemmap_pages(h);
}
_

2022-06-22 03:46:51

by Muchun Song

Subject: Re: [PATCH v5 0/2] make hugetlb_optimize_vmemmap compatible with memmap_on_memory

On Tue, Jun 21, 2022 at 01:53:13PM -0700, Andrew Morton wrote:
> On Mon, 20 Jun 2022 19:06:14 +0800 Muchun Song <[email protected]> wrote:
>
> > This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory
> > and is based on mm-stable. The reason refers to the patch 2's commit log.
> >
> > v5:
> > - Replace enum to defines per David.
> > - Walk vmemmap page tables to avoid false-positive.
>
> I can't see this second change in the v3->v5 deltas?
>

My changelog was not clear. Let me clarify it here.

v3: Drop the section flag SECTION_CANNOT_OPTIMIZE_VMEMMAP and introduce a page
flag PageVmemmapSelfHosted to make both parameters compatible.
v4: Fix a compile error when !CONFIG_MEMORY_HOTPLUG and a bug when a memory
block spans multiple sections.
v5: Fix a bug where the PageVmemmapSelfHosted() check could be false-positive
(sketched below).
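
To spell out the v5 change in code terms, a condensed sketch of the
before/after already visible in the deltas above (declarations trimmed,
function names are ad-hoc and not from the series):

/* v4: derive a page from the section's encoded mem_map and test it;
 * this could be a false positive. */
static bool vmemmap_self_hosted_v4(unsigned long pfn)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        struct page *vmemmap_page =
                sparse_decode_mem_map(ms->section_mem_map, pfn_to_section_nr(pfn));

        return PageVmemmapSelfHosted(vmemmap_page);
}

/* v5: walk the kernel page table at the head page's vmemmap address and
 * test the struct page that actually backs it. */
static bool vmemmap_self_hosted_v5(struct page *head)
{
        unsigned long vaddr = (unsigned long)head;
        pmd_t *pmdp = pmd_off_k(vaddr);
        pmd_t pmd = READ_ONCE(*pmdp);
        struct page *vmemmap_page;

        if (pmd_leaf(pmd))
                vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
        else
                vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));

        return PageVmemmapSelfHosted(vmemmap_page);
}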

Thanks.

> From: Muchun Song <[email protected]>
> Subject: mm-memory_hotplug-enumerate-all-supported-section-flags-v5
> Date: Mon, 20 Jun 2022 19:06:15 +0800
>
> replace enum with defines per David
>
> Link: https://lkml.kernel.org/r/[email protected]
> Signed-off-by: Muchun Song <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> include/linux/mmzone.h | 13 +++++--------
> 1 file changed, 5 insertions(+), 8 deletions(-)
>
> --- a/include/linux/mmzone.h~mm-memory_hotplug-enumerate-all-supported-section-flags-v5
> +++ a/include/linux/mmzone.h
> @@ -1439,16 +1439,13 @@ enum {
> SECTION_MAP_LAST_BIT,
> };
>
> -enum {
> - SECTION_MARKED_PRESENT = BIT(SECTION_MARKED_PRESENT_BIT),
> - SECTION_HAS_MEM_MAP = BIT(SECTION_HAS_MEM_MAP_BIT),
> - SECTION_IS_ONLINE = BIT(SECTION_IS_ONLINE_BIT),
> - SECTION_IS_EARLY = BIT(SECTION_IS_EARLY_BIT),
> +#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT)
> +#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT)
> +#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT)
> +#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT)
> #ifdef CONFIG_ZONE_DEVICE
> - SECTION_TAINT_ZONE_DEVICE = BIT(SECTION_TAINT_ZONE_DEVICE_BIT),
> +#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
> #endif
> -};
> -
> #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
> #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT
>
> _
>
>
>
>
> From: Muchun Song <[email protected]>
> Subject: mm-memory_hotplug-make-hugetlb_optimize_vmemmap-compatible-with-memmap_on_memory-v5
> Date: Mon, 20 Jun 2022 19:06:16 +0800
>
> walk vmemmap page tables to avoid false-positive
>
> Link: https://lkml.kernel.org/r/[email protected]
> Signed-off-by: Muchun Song <[email protected]>
> Co-developed-by: Oscar Salvador <[email protected]>
> Signed-off-by: Oscar Salvador <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> mm/hugetlb_vmemmap.c | 69 ++++++++++++++++++++++++++---------------
> 1 file changed, 44 insertions(+), 25 deletions(-)
>
> --- a/mm/hugetlb_vmemmap.c~mm-memory_hotplug-make-hugetlb_optimize_vmemmap-compatible-with-memmap_on_memory-v5
> +++ a/mm/hugetlb_vmemmap.c
> @@ -10,6 +10,7 @@
> */
> #define pr_fmt(fmt) "HugeTLB: " fmt
>
> +#include <linux/memory.h>
> #include "hugetlb_vmemmap.h"
>
> /*
> @@ -99,34 +100,52 @@ int hugetlb_vmemmap_alloc(struct hstate
> static unsigned int vmemmap_optimizable_pages(struct hstate *h,
> struct page *head)
> {
> - struct mem_section *ms;
> - struct page *vmemmap_page;
> - unsigned long pfn = page_to_pfn(head);
> -
> if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> return 0;
>
> - ms = __pfn_to_section(pfn);
> - vmemmap_page = sparse_decode_mem_map(ms->section_mem_map,
> - pfn_to_section_nr(pfn));
> - /*
> - * Only the vmemmap pages' vmemmap may be marked as VmemmapSelfHosted.
> - *
> - * Due to HugeTLB alignment requirements, and the vmemmap pages being
> - * at the start of the hotplugged memory region. Checking any vmemmap
> - * page's vmemmap is fine.
> - *
> - * [ hotplugged memory ]
> - * [ vmemmap ][ usable memory ]
> - * ^ | | |
> - * +---+ | |
> - * ^ | |
> - * +--------+ |
> - * ^ |
> - * +-----------------+
> - */
> - if (PageVmemmapSelfHosted(vmemmap_page))
> - return 0;
> + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
> + pmd_t *pmdp, pmd;
> + struct page *vmemmap_page;
> + unsigned long vaddr = (unsigned long)head;
> +
> + /*
> + * Only the vmemmap page's vmemmap page can be self-hosted.
> + * Walking the page tables to find the backing page of the
> + * vmemmap page.
> + */
> + pmdp = pmd_off_k(vaddr);
> + /*
> + * The READ_ONCE() is used to stabilize *pmdp in a register or
> + * on the stack so that it will stop changing under the code.
> + * The only concurrent operation where it can be changed is
> + * split_vmemmap_huge_pmd() (*pmdp will be stable after this
> + * operation).
> + */
> + pmd = READ_ONCE(*pmdp);
> + if (pmd_leaf(pmd))
> + vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
> + else
> + vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
> + /*
> + * Due to HugeTLB alignment requirements and the vmemmap pages
> + * being at the start of the hotplugged memory region in
> + * memory_hotplug.memmap_on_memory case. Checking any vmemmap
> + * page's vmemmap page if it is marked as VmemmapSelfHosted is
> + * sufficient.
> + *
> + * [ hotplugged memory ]
> + * [ section ][...][ section ]
> + * [ vmemmap ][ usable memory ]
> + * ^ | | |
> + * +---+ | |
> + * ^ | |
> + * +-------+ |
> + * ^ |
> + * +-------------------------------------------+
> + */
> + if (PageVmemmapSelfHosted(vmemmap_page))
> + return 0;
> + }
>
> return hugetlb_optimize_vmemmap_pages(h);
> }
> _
>
>