This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory
and is based on next-20220518. The reason refers to the patch 2's commit log.
v2:
- Fix compile error when !CONFIG_ZONE_DEVICE reported by kernel test robot.
Muchun Song (2):
mm: memory_hotplug: enumerate all supported section flags
mm: memory_hotplug: introduce SECTION_CANNOT_OPTIMIZE_VMEMMAP
Documentation/admin-guide/kernel-parameters.txt | 22 ++++----
Documentation/admin-guide/sysctl/vm.rst | 5 +-
include/linux/kconfig.h | 1 +
include/linux/memory_hotplug.h | 9 ----
include/linux/mmzone.h | 71 ++++++++++++++++++++++---
mm/hugetlb_vmemmap.c | 28 +++++++---
mm/memory_hotplug.c | 28 +++++-----
mm/sparse.c | 8 +++
8 files changed, 119 insertions(+), 53 deletions(-)
--
2.11.0
For now, the feature of hugetlb_free_vmemmap is not compatible with the
feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap
takes precedence over memory_hotplug.memmap_on_memory. However, someone
wants to make memory_hotplug.memmap_on_memory takes precedence over
hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
succeed memory hotplug in close-to-OOM situations. So the decision
of making hugetlb_free_vmemmap take precedence is not wise and elegant.
The proper approach is to have hugetlb_vmemmap.c do the check whether
the section which the HugeTLB pages belong to can be optimized. If
the section's vmemmap pages are allocated from the added memory block
itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap,
otherwise, do the optimization. Then both kernel parameters are
compatible. So this patch introduces SECTION_CANNOT_OPTIMIZE_VMEMMAP
to indicate whether the section could be optimized.
Signed-off-by: Muchun Song <[email protected]>
---
Documentation/admin-guide/kernel-parameters.txt | 22 +++++++++----------
Documentation/admin-guide/sysctl/vm.rst | 5 ++---
include/linux/memory_hotplug.h | 9 --------
include/linux/mmzone.h | 17 +++++++++++++++
mm/hugetlb_vmemmap.c | 28 ++++++++++++++++++-------
mm/memory_hotplug.c | 22 +++++++------------
mm/sparse.c | 8 +++++++
7 files changed, 66 insertions(+), 45 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c087f578d9d8..5359ffb04a84 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1730,9 +1730,11 @@
Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
the default is on.
- This is not compatible with memory_hotplug.memmap_on_memory.
- If both parameters are enabled, hugetlb_free_vmemmap takes
- precedence over memory_hotplug.memmap_on_memory.
+ Note that the vmemmap pages may be allocated from the added
+ memory block itself when memory_hotplug.memmap_on_memory is
+ enabled, those vmemmap pages cannot be optimized even if this
+ feature is enabled. Other vmemmap pages not allocated from
+ the added memory block itself do not be affected.
hung_task_panic=
[KNL] Should the hung task detector generate panics.
@@ -3077,10 +3079,12 @@
[KNL,X86,ARM] Boolean flag to enable this feature.
Format: {on | off (default)}
When enabled, runtime hotplugged memory will
- allocate its internal metadata (struct pages)
- from the hotadded memory which will allow to
- hotadd a lot of memory without requiring
- additional memory to do so.
+ allocate its internal metadata (struct pages,
+ those vmemmap pages cannot be optimized even
+ if hugetlb_free_vmemmap is enabled) from the
+ hotadded memory which will allow to hotadd a
+ lot of memory without requiring additional
+ memory to do so.
This feature is disabled by default because it
has some implication on large (e.g. GB)
allocations in some configurations (e.g. small
@@ -3090,10 +3094,6 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
- This is not compatible with hugetlb_free_vmemmap. If
- both parameters are enabled, hugetlb_free_vmemmap takes
- precedence over memory_hotplug.memmap_on_memory.
-
memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 5c9aa171a0d3..d7374a1e8ac9 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
hugetlb_optimize_vmemmap
========================
-This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
-is configured or the size of 'struct page' (a structure defined in
-include/linux/mm_types.h) is not power of two (an unusual system config could
+This knob is not available when the size of 'struct page' (a structure defined
+in include/linux/mm_types.h) is not power of two (an unusual system config could
result in this).
Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d7edf62a6a..e0b2209ab71c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
-#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-bool mhp_memmap_on_memory(void);
-#else
-static inline bool mhp_memmap_on_memory(void)
-{
- return false;
-}
-#endif
-
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2cf2a76535ab..607a4fcabbd4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1434,6 +1434,7 @@ extern size_t mem_section_usage_size(void);
MAPPER(IS_ONLINE) \
MAPPER(IS_EARLY) \
MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
+ MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) \
MAPPER(MAP_LAST_BIT)
#define __SECTION_SHIFT_FLAG_MAPPER_0(x)
@@ -1471,6 +1472,22 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
return (struct page *)map;
}
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
+{
+ ms->section_mem_map |= SECTION_CANNOT_OPTIMIZE_VMEMMAP;
+}
+
+static inline int section_cannot_optimize_vmemmap(struct mem_section *ms)
+{
+ return (ms && (ms->section_mem_map & SECTION_CANNOT_OPTIMIZE_VMEMMAP));
+}
+#else
+static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
+{
+}
+#endif
+
static inline int present_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index fcd9f7872064..f12170520337 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -97,18 +97,32 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
return ret;
}
+static unsigned int optimizable_vmemmap_pages(struct hstate *h,
+ struct page *head)
+{
+ unsigned long pfn = page_to_pfn(head);
+ unsigned long end = pfn + pages_per_huge_page(h);
+
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+ return 0;
+
+ for (; pfn < end; pfn += PAGES_PER_SECTION) {
+ if (section_cannot_optimize_vmemmap(__pfn_to_section(pfn)))
+ return 0;
+ }
+
+ return hugetlb_optimize_vmemmap_pages(h);
+}
+
void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ vmemmap_pages = optimizable_vmemmap_pages(h, head);
if (!vmemmap_pages)
return;
- if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
- return;
-
static_branch_inc(&hugetlb_optimize_vmemmap_key);
vmemmap_addr += RESERVE_VMEMMAP_SIZE;
@@ -199,10 +213,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static __init int hugetlb_vmemmap_sysctls_init(void)
{
/*
- * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
- * crosses page boundaries, the vmemmap pages cannot be optimized.
+ * If "struct page" crosses page boundaries, the vmemmap pages cannot
+ * be optimized.
*/
- if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ if (is_power_of_2(sizeof(struct page)))
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3b360eda933f..7309694c4dee 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -43,30 +43,22 @@
#include "shuffle.h"
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
-{
- if (hugetlb_optimize_vmemmap_enabled())
- return 0;
- return param_set_bool(val, kp);
-}
-
-static const struct kernel_param_ops memmap_on_memory_ops = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = memmap_on_memory_set,
- .get = param_get_bool,
-};
-
/*
* memory_hotplug.memmap_on_memory parameter
*/
static bool memmap_on_memory __ro_after_init;
-module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
+module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
-bool mhp_memmap_on_memory(void)
+static inline bool mhp_memmap_on_memory(void)
{
return memmap_on_memory;
}
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+ return false;
+}
#endif
enum {
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..1f353bf9ea6b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -913,6 +913,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
__section_mark_present(ms, section_nr);
+ /*
+ * Mark whole section as non-optimizable once there is a subsection
+ * whose vmemmap pages are allocated from alternative allocator. The
+ * early section is always optimizable since the early section's
+ * vmemmap pages do not consider partially being populated.
+ */
+ if (!early_section(ms) && altmap)
+ section_mark_cannot_optimize_vmemmap(ms);
/* Align memmap to section boundary in the subsection case */
if (section_nr_to_pfn(section_nr) != start_pfn)
--
2.11.0
We are almost running out of section flags, only one bit is available in
the worst case (powerpc with 256k pages). However, there are still some
free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has
10 bits available, arm64 has 8 bits available with worst case of 64K
pages). We have hard coded those numbers in code, it is inconvenient to
use those bits on other architectures except powerpc. So transfer those
section flags to enumeration to make it easy to add new section flags in
the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/kconfig.h | 1 +
include/linux/mmzone.h | 54 +++++++++++++++++++++++++++++++++++++++++--------
mm/memory_hotplug.c | 6 ++++++
3 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index 20d1079e92b4..7044032b9f42 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -10,6 +10,7 @@
#define __LITTLE_ENDIAN 1234
#endif
+#define __ARG_PLACEHOLDER_ 0,
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 299259cfe462..2cf2a76535ab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1422,16 +1422,47 @@ extern size_t mem_section_usage_size(void);
* (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
* worst combination is powerpc with 256k pages,
* which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available.
+ * To sum it up, at least 6 bits are available on all architectures.
+ * However, we can exceed 6 bits on some other architectures except
+ * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
+ * with the worst case of 64K pages on arm64) if we make sure the
+ * exceeded bit is not applicable to powerpc.
*/
-#define SECTION_MARKED_PRESENT (1UL<<0)
-#define SECTION_HAS_MEM_MAP (1UL<<1)
-#define SECTION_IS_ONLINE (1UL<<2)
-#define SECTION_IS_EARLY (1UL<<3)
-#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
-#define SECTION_MAP_LAST_BIT (1UL<<5)
+#define ENUM_SECTION_FLAG(MAPPER) \
+ MAPPER(MARKED_PRESENT) \
+ MAPPER(HAS_MEM_MAP) \
+ MAPPER(IS_ONLINE) \
+ MAPPER(IS_EARLY) \
+ MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
+ MAPPER(MAP_LAST_BIT)
+
+#define __SECTION_SHIFT_FLAG_MAPPER_0(x)
+#define __SECTION_SHIFT_FLAG_MAPPER_1(x) SECTION_##x##_SHIFT,
+#define __SECTION_SHIFT_FLAG_MAPPER(x, ...) \
+ __PASTE(__SECTION_SHIFT_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
+
+#define __SECTION_FLAG_MAPPER_0(x)
+#define __SECTION_FLAG_MAPPER_1(x) SECTION_##x = BIT(SECTION_##x##_SHIFT),
+#define __SECTION_FLAG_MAPPER(x, ...) \
+ __PASTE(__SECTION_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
+
+enum {
+ /*
+ * Generate a series of enumeration flags like SECTION_$name_SHIFT.
+ * Each entry in ENUM_SECTION_FLAG() macro will be generated to one
+ * enumeration iff the 2nd parameter of MAPPER() is defined or absent.
+ * The $name comes from the 1st parameter of MAPPER() macro.
+ */
+ ENUM_SECTION_FLAG(__SECTION_SHIFT_FLAG_MAPPER)
+ /*
+ * Generate a series of enumeration flags like:
+ * SECTION_$name = BIT(SECTION_$name_SHIFT)
+ */
+ ENUM_SECTION_FLAG(__SECTION_FLAG_MAPPER)
+};
+
#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT 6
+#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT_SHIFT
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
@@ -1470,12 +1501,19 @@ static inline int online_section(struct mem_section *section)
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
+#ifdef CONFIG_ZONE_DEVICE
static inline int online_device_section(struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
+#else
+static inline int online_device_section(struct mem_section *section)
+{
+ return 0;
+}
+#endif
static inline int online_section_nr(unsigned long nr)
{
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1213d0c67a53..3b360eda933f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -672,12 +672,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
}
+#ifdef CONFIG_ZONE_DEVICE
static void section_taint_zone_device(unsigned long pfn)
{
struct mem_section *ms = __pfn_to_section(pfn);
ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
/*
* Associate the pfn range with the given zone, initializing the memmaps
--
2.11.0
On 20.05.22 04:55, Muchun Song wrote:
> We are almost running out of section flags, only one bit is available in
> the worst case (powerpc with 256k pages). However, there are still some
> free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has
> 10 bits available, arm64 has 8 bits available with worst case of 64K
> pages). We have hard coded those numbers in code, it is inconvenient to
> use those bits on other architectures except powerpc. So transfer those
> section flags to enumeration to make it easy to add new section flags in
> the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
> CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.
>
> Signed-off-by: Muchun Song <[email protected]>
Sorry for the late reply. This looks overly complicated to me.
IOW, staring at that patch I don't quite like what I am seeing.
Something like the following is *a lot* easier to read than some
MAPPER macro magic. What speaks against it?
/*
* Section bits use the lower unused bits in the ->section_mem_map
*/
enum {
SECTION_MARKED_PRESENT_BIT = 0,
SECTION_HAS_MEM_MAP_BIT,
...
#ifdef ZONE_DEVICE
SECTION_TAINT_ZONE_DEVICE_BIT
#endif
}
#define SECTION_MARKED_PRESENT (1ULL << SECTION_MARKED_PRESENT_BIT)
...
#ifdef ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE (1ULL << SECTION_TAINT_ZONE_DEVICE_BIT)
#endif /* ZONE_DEVICE */
> ---
> include/linux/kconfig.h | 1 +
> include/linux/mmzone.h | 54 +++++++++++++++++++++++++++++++++++++++++--------
> mm/memory_hotplug.c | 6 ++++++
> 3 files changed, 53 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
> index 20d1079e92b4..7044032b9f42 100644
> --- a/include/linux/kconfig.h
> +++ b/include/linux/kconfig.h
> @@ -10,6 +10,7 @@
> #define __LITTLE_ENDIAN 1234
> #endif
>
> +#define __ARG_PLACEHOLDER_ 0,
> #define __ARG_PLACEHOLDER_1 0,
> #define __take_second_arg(__ignored, val, ...) val
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 299259cfe462..2cf2a76535ab 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1422,16 +1422,47 @@ extern size_t mem_section_usage_size(void);
> * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
> * worst combination is powerpc with 256k pages,
> * which results in PFN_SECTION_SHIFT equal 6.
> - * To sum it up, at least 6 bits are available.
> + * To sum it up, at least 6 bits are available on all architectures.
> + * However, we can exceed 6 bits on some other architectures except
> + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
> + * with the worst case of 64K pages on arm64) if we make sure the
> + * exceeded bit is not applicable to powerpc.
> */
> -#define SECTION_MARKED_PRESENT (1UL<<0)
> -#define SECTION_HAS_MEM_MAP (1UL<<1)
> -#define SECTION_IS_ONLINE (1UL<<2)
> -#define SECTION_IS_EARLY (1UL<<3)
> -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
> -#define SECTION_MAP_LAST_BIT (1UL<<5)
> +#define ENUM_SECTION_FLAG(MAPPER) \
> + MAPPER(MARKED_PRESENT) \
> + MAPPER(HAS_MEM_MAP) \
> + MAPPER(IS_ONLINE) \
> + MAPPER(IS_EARLY) \
> + MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
> + MAPPER(MAP_LAST_BIT)
> +
> +#define __SECTION_SHIFT_FLAG_MAPPER_0(x)
> +#define __SECTION_SHIFT_FLAG_MAPPER_1(x) SECTION_##x##_SHIFT,
> +#define __SECTION_SHIFT_FLAG_MAPPER(x, ...) \
> + __PASTE(__SECTION_SHIFT_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
> +
> +#define __SECTION_FLAG_MAPPER_0(x)
> +#define __SECTION_FLAG_MAPPER_1(x) SECTION_##x = BIT(SECTION_##x##_SHIFT),
> +#define __SECTION_FLAG_MAPPER(x, ...) \
> + __PASTE(__SECTION_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
> +
> +enum {
> + /*
> + * Generate a series of enumeration flags like SECTION_$name_SHIFT.
> + * Each entry in ENUM_SECTION_FLAG() macro will be generated to one
> + * enumeration iff the 2nd parameter of MAPPER() is defined or absent.
> + * The $name comes from the 1st parameter of MAPPER() macro.
> + */
> + ENUM_SECTION_FLAG(__SECTION_SHIFT_FLAG_MAPPER)
> + /*
> + * Generate a series of enumeration flags like:
> + * SECTION_$name = BIT(SECTION_$name_SHIFT)
> + */
> + ENUM_SECTION_FLAG(__SECTION_FLAG_MAPPER)
> +};
> +
> #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
> -#define SECTION_NID_SHIFT 6
> +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT_SHIFT
>
> static inline struct page *__section_mem_map_addr(struct mem_section *section)
> {
> @@ -1470,12 +1501,19 @@ static inline int online_section(struct mem_section *section)
> return (section && (section->section_mem_map & SECTION_IS_ONLINE));
> }
>
> +#ifdef CONFIG_ZONE_DEVICE
> static inline int online_device_section(struct mem_section *section)
> {
> unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
>
> return section && ((section->section_mem_map & flags) == flags);
> }
> +#else
> +static inline int online_device_section(struct mem_section *section)
> +{
> + return 0;
> +}
> +#endif
>
> static inline int online_section_nr(unsigned long nr)
> {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 1213d0c67a53..3b360eda933f 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -672,12 +672,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
>
> }
>
> +#ifdef CONFIG_ZONE_DEVICE
> static void section_taint_zone_device(unsigned long pfn)
> {
> struct mem_section *ms = __pfn_to_section(pfn);
>
> ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
> }
> +#else
> +static inline void section_taint_zone_device(unsigned long pfn)
> +{
> +}
> +#endif
>
> /*
> * Associate the pfn range with the given zone, initializing the memmaps
--
Thanks,
David / dhildenb
On 20.05.22 04:55, Muchun Song wrote:
> For now, the feature of hugetlb_free_vmemmap is not compatible with the
> feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap
> takes precedence over memory_hotplug.memmap_on_memory. However, someone
> wants to make memory_hotplug.memmap_on_memory takes precedence over
> hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
> succeed memory hotplug in close-to-OOM situations. So the decision
> of making hugetlb_free_vmemmap take precedence is not wise and elegant.
> The proper approach is to have hugetlb_vmemmap.c do the check whether
> the section which the HugeTLB pages belong to can be optimized. If
> the section's vmemmap pages are allocated from the added memory block
> itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap,
> otherwise, do the optimization. Then both kernel parameters are
> compatible. So this patch introduces SECTION_CANNOT_OPTIMIZE_VMEMMAP
> to indicate whether the section could be optimized.
>
In theory, we have that information stored in the relevant memory block,
but I assume that lookup in the xarray + locking is impractical.
I wonder if we can derive that information simply from the vmemmap pages
themselves, because *drumroll*
For one vmemmap page (the first one), the vmemmap corresponds to itself
-- what?!
[ hotplugged memory ]
[ memmap ][ usable memory ]
| | |
^--- | |
^------- |
^----------------------
The memmap of the first page of hotplugged memory falls onto itself.
We'd have to derive from actual "usable memory" that condition.
We currently support memmap_on_memory memory only within fixed-size
memory blocks. So "hotplugged memory" is guaranteed to be aligned to
memory_block_size_bytes() and the size is memory_block_size_bytes().
If we'd have a page falling into usbale memory, we'd simply lookup the
first page and test if the vmemmap maps to itself.
Of course, once we'd support variable-sized memory blocks, it would be
different.
An easier/future-proof approach might simply be flagging the vmemmap
pages as being special. We reuse page flags for that, which don't have
semantics yet (i.e., PG_reserved indicates a boot-time allocation via
memblock).
You'd walk the applicable vmemmap pages you want to optimize and check
if they are marked as special. You don't have to walk all but can
optimize: memmap_on_memory uses a vmemmap size that's at least PMD_SIZE.
So it's sufficient to check a single vmemmap page inside a PMD_SIZE
vmemmap range.
> Signed-off-by: Muchun Song <[email protected]>
> ---
> Documentation/admin-guide/kernel-parameters.txt | 22 +++++++++----------
> Documentation/admin-guide/sysctl/vm.rst | 5 ++---
> include/linux/memory_hotplug.h | 9 --------
> include/linux/mmzone.h | 17 +++++++++++++++
> mm/hugetlb_vmemmap.c | 28 ++++++++++++++++++-------
> mm/memory_hotplug.c | 22 +++++++------------
> mm/sparse.c | 8 +++++++
> 7 files changed, 66 insertions(+), 45 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index c087f578d9d8..5359ffb04a84 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1730,9 +1730,11 @@
> Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
> the default is on.
>
> - This is not compatible with memory_hotplug.memmap_on_memory.
> - If both parameters are enabled, hugetlb_free_vmemmap takes
> - precedence over memory_hotplug.memmap_on_memory.
> + Note that the vmemmap pages may be allocated from the added
> + memory block itself when memory_hotplug.memmap_on_memory is
> + enabled, those vmemmap pages cannot be optimized even if this
> + feature is enabled. Other vmemmap pages not allocated from
> + the added memory block itself do not be affected.
>
> hung_task_panic=
> [KNL] Should the hung task detector generate panics.
> @@ -3077,10 +3079,12 @@
> [KNL,X86,ARM] Boolean flag to enable this feature.
> Format: {on | off (default)}
> When enabled, runtime hotplugged memory will
> - allocate its internal metadata (struct pages)
> - from the hotadded memory which will allow to
> - hotadd a lot of memory without requiring
> - additional memory to do so.
> + allocate its internal metadata (struct pages,
> + those vmemmap pages cannot be optimized even
> + if hugetlb_free_vmemmap is enabled) from the
> + hotadded memory which will allow to hotadd a
> + lot of memory without requiring additional
> + memory to do so.
> This feature is disabled by default because it
> has some implication on large (e.g. GB)
> allocations in some configurations (e.g. small
> @@ -3090,10 +3094,6 @@
> Note that even when enabled, there are a few cases where
> the feature is not effective.
>
> - This is not compatible with hugetlb_free_vmemmap. If
> - both parameters are enabled, hugetlb_free_vmemmap takes
> - precedence over memory_hotplug.memmap_on_memory.
> -
> memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
> Format: <integer>
> default : 0 <disable>
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index 5c9aa171a0d3..d7374a1e8ac9 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
> hugetlb_optimize_vmemmap
> ========================
>
> -This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
> -is configured or the size of 'struct page' (a structure defined in
> -include/linux/mm_types.h) is not power of two (an unusual system config could
> +This knob is not available when the size of 'struct page' (a structure defined
> +in include/linux/mm_types.h) is not power of two (an unusual system config could
> result in this).
>
> Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 20d7edf62a6a..e0b2209ab71c 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
> extern bool mhp_supports_memmap_on_memory(unsigned long size);
> #endif /* CONFIG_MEMORY_HOTPLUG */
>
> -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> -bool mhp_memmap_on_memory(void);
> -#else
> -static inline bool mhp_memmap_on_memory(void)
> -{
> - return false;
> -}
> -#endif
> -
> #endif /* __LINUX_MEMORY_HOTPLUG_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2cf2a76535ab..607a4fcabbd4 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1434,6 +1434,7 @@ extern size_t mem_section_usage_size(void);
> MAPPER(IS_ONLINE) \
> MAPPER(IS_EARLY) \
> MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
> + MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) \
> MAPPER(MAP_LAST_BIT)
>
> #define __SECTION_SHIFT_FLAG_MAPPER_0(x)
> @@ -1471,6 +1472,22 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
> return (struct page *)map;
> }
>
> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> +{
> + ms->section_mem_map |= SECTION_CANNOT_OPTIMIZE_VMEMMAP;
> +}
> +
> +static inline int section_cannot_optimize_vmemmap(struct mem_section *ms)
> +{
> + return (ms && (ms->section_mem_map & SECTION_CANNOT_OPTIMIZE_VMEMMAP));
> +}
> +#else
> +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> +{
> +}
> +#endif
> +
> static inline int present_section(struct mem_section *section)
> {
> return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index fcd9f7872064..f12170520337 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -97,18 +97,32 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
> return ret;
> }
>
> +static unsigned int optimizable_vmemmap_pages(struct hstate *h,
> + struct page *head)
> +{
> + unsigned long pfn = page_to_pfn(head);
> + unsigned long end = pfn + pages_per_huge_page(h);
> +
> + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> + return 0;
> +
> + for (; pfn < end; pfn += PAGES_PER_SECTION) {
> + if (section_cannot_optimize_vmemmap(__pfn_to_section(pfn)))
> + return 0;
> + }
> +
> + return hugetlb_optimize_vmemmap_pages(h);
> +}
> +
> void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> {
> unsigned long vmemmap_addr = (unsigned long)head;
> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
>
> - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> + vmemmap_pages = optimizable_vmemmap_pages(h, head);
> if (!vmemmap_pages)
> return;
>
> - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> - return;
> -
> static_branch_inc(&hugetlb_optimize_vmemmap_key);
>
> vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> @@ -199,10 +213,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> static __init int hugetlb_vmemmap_sysctls_init(void)
> {
> /*
> - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> - * crosses page boundaries, the vmemmap pages cannot be optimized.
> + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> + * be optimized.
> */
> - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> + if (is_power_of_2(sizeof(struct page)))
> register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
>
> return 0;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 3b360eda933f..7309694c4dee 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -43,30 +43,22 @@
> #include "shuffle.h"
>
> #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> -{
> - if (hugetlb_optimize_vmemmap_enabled())
> - return 0;
> - return param_set_bool(val, kp);
> -}
> -
> -static const struct kernel_param_ops memmap_on_memory_ops = {
> - .flags = KERNEL_PARAM_OPS_FL_NOARG,
> - .set = memmap_on_memory_set,
> - .get = param_get_bool,
> -};
> -
> /*
> * memory_hotplug.memmap_on_memory parameter
> */
> static bool memmap_on_memory __ro_after_init;
> -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
> +module_param(memmap_on_memory, bool, 0444);
> MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
>
> -bool mhp_memmap_on_memory(void)
> +static inline bool mhp_memmap_on_memory(void)
> {
> return memmap_on_memory;
> }
> +#else
> +static inline bool mhp_memmap_on_memory(void)
> +{
> + return false;
> +}
> #endif
>
> enum {
> diff --git a/mm/sparse.c b/mm/sparse.c
> index cb3bfae64036..1f353bf9ea6b 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -913,6 +913,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
> ms = __nr_to_section(section_nr);
> set_section_nid(section_nr, nid);
> __section_mark_present(ms, section_nr);
> + /*
> + * Mark whole section as non-optimizable once there is a subsection
> + * whose vmemmap pages are allocated from alternative allocator. The
> + * early section is always optimizable since the early section's
> + * vmemmap pages do not consider partially being populated.
> + */
> + if (!early_section(ms) && altmap)
> + section_mark_cannot_optimize_vmemmap(ms);
>
> /* Align memmap to section boundary in the subsection case */
> if (section_nr_to_pfn(section_nr) != start_pfn)
--
Thanks,
David / dhildenb
On Wed, Jun 15, 2022 at 11:35:09AM +0200, David Hildenbrand wrote:
> On 20.05.22 04:55, Muchun Song wrote:
> > We are almost running out of section flags, only one bit is available in
> > the worst case (powerpc with 256k pages). However, there are still some
> > free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has
> > 10 bits available, arm64 has 8 bits available with worst case of 64K
> > pages). We have hard coded those numbers in code, it is inconvenient to
> > use those bits on other architectures except powerpc. So transfer those
> > section flags to enumeration to make it easy to add new section flags in
> > the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
> > CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.
> >
> > Signed-off-by: Muchun Song <[email protected]>
>
> Sorry for the late reply. This looks overly complicated to me.
>
> IOW, staring at that patch I don't quite like what I am seeing.
>
>
> Something like the following is *a lot* easier to read than some
> MAPPER macro magic. What speaks against it?
>
Thanks for taking a look.
Yeah, it is more readable. This question is also raised by Oscar.
I pasted the reply to here.
"
Yeah, it's a little complicated. All the magic aims to generate
two enumeration from one MAPPER(xxx, config), one is SECTION_xxx_SHIFT,
another is SECTION_xxx = BIT(SECTION_xxx_SHIFT) if the 'config' is
configured. If we want to add a new flag, like the follow patch, just
one line could do that.
MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
Without those magic, we have to add 4 lines like follows to do the
similar thing.
#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
SECTION_CANNOT_OPTIMIZE_VMEMMAP_SHIFT,
#define SECTION_CANNOT_OPTIMIZE_VMEMMAP BIT(SECTION_CANNOT_OPTIMIZE_VMEMMAP_SHIFT)
#endif
I admit it is more clear but not simplified as above approach.
"
Both two approaches are fine to me. I can switch to the following approach
seems you think the following one is better.
Thanks.
> /*
> * Section bits use the lower unused bits in the ->section_mem_map
> */
> enum {
> SECTION_MARKED_PRESENT_BIT = 0,
> SECTION_HAS_MEM_MAP_BIT,
> ...
> #ifdef ZONE_DEVICE
> SECTION_TAINT_ZONE_DEVICE_BIT
> #endif
> }
>
> #define SECTION_MARKED_PRESENT (1ULL << SECTION_MARKED_PRESENT_BIT)
> ...
> #ifdef ZONE_DEVICE
> #define SECTION_TAINT_ZONE_DEVICE (1ULL << SECTION_TAINT_ZONE_DEVICE_BIT)
> #endif /* ZONE_DEVICE */
>
>
>
> > ---
> > include/linux/kconfig.h | 1 +
> > include/linux/mmzone.h | 54 +++++++++++++++++++++++++++++++++++++++++--------
> > mm/memory_hotplug.c | 6 ++++++
> > 3 files changed, 53 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
> > index 20d1079e92b4..7044032b9f42 100644
> > --- a/include/linux/kconfig.h
> > +++ b/include/linux/kconfig.h
> > @@ -10,6 +10,7 @@
> > #define __LITTLE_ENDIAN 1234
> > #endif
> >
> > +#define __ARG_PLACEHOLDER_ 0,
> > #define __ARG_PLACEHOLDER_1 0,
> > #define __take_second_arg(__ignored, val, ...) val
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 299259cfe462..2cf2a76535ab 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1422,16 +1422,47 @@ extern size_t mem_section_usage_size(void);
> > * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
> > * worst combination is powerpc with 256k pages,
> > * which results in PFN_SECTION_SHIFT equal 6.
> > - * To sum it up, at least 6 bits are available.
> > + * To sum it up, at least 6 bits are available on all architectures.
> > + * However, we can exceed 6 bits on some other architectures except
> > + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
> > + * with the worst case of 64K pages on arm64) if we make sure the
> > + * exceeded bit is not applicable to powerpc.
> > */
> > -#define SECTION_MARKED_PRESENT (1UL<<0)
> > -#define SECTION_HAS_MEM_MAP (1UL<<1)
> > -#define SECTION_IS_ONLINE (1UL<<2)
> > -#define SECTION_IS_EARLY (1UL<<3)
> > -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
> > -#define SECTION_MAP_LAST_BIT (1UL<<5)
> > +#define ENUM_SECTION_FLAG(MAPPER) \
> > + MAPPER(MARKED_PRESENT) \
> > + MAPPER(HAS_MEM_MAP) \
> > + MAPPER(IS_ONLINE) \
> > + MAPPER(IS_EARLY) \
> > + MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
> > + MAPPER(MAP_LAST_BIT)
> > +
> > +#define __SECTION_SHIFT_FLAG_MAPPER_0(x)
> > +#define __SECTION_SHIFT_FLAG_MAPPER_1(x) SECTION_##x##_SHIFT,
> > +#define __SECTION_SHIFT_FLAG_MAPPER(x, ...) \
> > + __PASTE(__SECTION_SHIFT_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
> > +
> > +#define __SECTION_FLAG_MAPPER_0(x)
> > +#define __SECTION_FLAG_MAPPER_1(x) SECTION_##x = BIT(SECTION_##x##_SHIFT),
> > +#define __SECTION_FLAG_MAPPER(x, ...) \
> > + __PASTE(__SECTION_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x)
> > +
> > +enum {
> > + /*
> > + * Generate a series of enumeration flags like SECTION_$name_SHIFT.
> > + * Each entry in ENUM_SECTION_FLAG() macro will be generated to one
> > + * enumeration iff the 2nd parameter of MAPPER() is defined or absent.
> > + * The $name comes from the 1st parameter of MAPPER() macro.
> > + */
> > + ENUM_SECTION_FLAG(__SECTION_SHIFT_FLAG_MAPPER)
> > + /*
> > + * Generate a series of enumeration flags like:
> > + * SECTION_$name = BIT(SECTION_$name_SHIFT)
> > + */
> > + ENUM_SECTION_FLAG(__SECTION_FLAG_MAPPER)
> > +};
> > +
> > #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
> > -#define SECTION_NID_SHIFT 6
> > +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT_SHIFT
> >
> > static inline struct page *__section_mem_map_addr(struct mem_section *section)
> > {
> > @@ -1470,12 +1501,19 @@ static inline int online_section(struct mem_section *section)
> > return (section && (section->section_mem_map & SECTION_IS_ONLINE));
> > }
> >
> > +#ifdef CONFIG_ZONE_DEVICE
> > static inline int online_device_section(struct mem_section *section)
> > {
> > unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
> >
> > return section && ((section->section_mem_map & flags) == flags);
> > }
> > +#else
> > +static inline int online_device_section(struct mem_section *section)
> > +{
> > + return 0;
> > +}
> > +#endif
> >
> > static inline int online_section_nr(unsigned long nr)
> > {
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index 1213d0c67a53..3b360eda933f 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -672,12 +672,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
> >
> > }
> >
> > +#ifdef CONFIG_ZONE_DEVICE
> > static void section_taint_zone_device(unsigned long pfn)
> > {
> > struct mem_section *ms = __pfn_to_section(pfn);
> >
> > ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
> > }
> > +#else
> > +static inline void section_taint_zone_device(unsigned long pfn)
> > +{
> > +}
> > +#endif
> >
> > /*
> > * Associate the pfn range with the given zone, initializing the memmaps
>
>
> --
> Thanks,
>
> David / dhildenb
>
>
On Wed, Jun 15, 2022 at 11:51:49AM +0200, David Hildenbrand wrote:
> On 20.05.22 04:55, Muchun Song wrote:
> > For now, the feature of hugetlb_free_vmemmap is not compatible with the
> > feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap
> > takes precedence over memory_hotplug.memmap_on_memory. However, someone
> > wants to make memory_hotplug.memmap_on_memory takes precedence over
> > hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
> > succeed memory hotplug in close-to-OOM situations. So the decision
> > of making hugetlb_free_vmemmap take precedence is not wise and elegant.
> > The proper approach is to have hugetlb_vmemmap.c do the check whether
> > the section which the HugeTLB pages belong to can be optimized. If
> > the section's vmemmap pages are allocated from the added memory block
> > itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap,
> > otherwise, do the optimization. Then both kernel parameters are
> > compatible. So this patch introduces SECTION_CANNOT_OPTIMIZE_VMEMMAP
> > to indicate whether the section could be optimized.
> >
>
> In theory, we have that information stored in the relevant memory block,
> but I assume that lookup in the xarray + locking is impractical.
>
> I wonder if we can derive that information simply from the vmemmap pages
> themselves, because *drumroll*
>
> For one vmemmap page (the first one), the vmemmap corresponds to itself
> -- what?!
>
>
> [ hotplugged memory ]
> [ memmap ][ usable memory ]
> | | |
> ^--- | |
> ^------- |
> ^----------------------
>
> The memmap of the first page of hotplugged memory falls onto itself.
> We'd have to derive from actual "usable memory" that condition.
>
>
> We currently support memmap_on_memory memory only within fixed-size
> memory blocks. So "hotplugged memory" is guaranteed to be aligned to
> memory_block_size_bytes() and the size is memory_block_size_bytes().
>
> If we'd have a page falling into usbale memory, we'd simply lookup the
> first page and test if the vmemmap maps to itself.
>
I think this can work. Should we use this approach in next version?
>
> Of course, once we'd support variable-sized memory blocks, it would be
> different.
>
>
> An easier/future-proof approach might simply be flagging the vmemmap
> pages as being special. We reuse page flags for that, which don't have
> semantics yet (i.e., PG_reserved indicates a boot-time allocation via
> memblock).
>
I think you mean flag vmemmap pages' struct page as PG_reserved if it
can be optimized, right? When the vmemmap pages are allocated in
hugetlb_vmemmap_alloc(), is it valid to flag them as PG_reserved (they
are allocated from buddy allocator not memblock)?
Thanks.
> You'd walk the applicable vmemmap pages you want to optimize and check
> if they are marked as special. You don't have to walk all but can
> optimize: memmap_on_memory uses a vmemmap size that's at least PMD_SIZE.
> So it's sufficient to check a single vmemmap page inside a PMD_SIZE
> vmemmap range.
>
>
> > Signed-off-by: Muchun Song <[email protected]>
> > ---
> > Documentation/admin-guide/kernel-parameters.txt | 22 +++++++++----------
> > Documentation/admin-guide/sysctl/vm.rst | 5 ++---
> > include/linux/memory_hotplug.h | 9 --------
> > include/linux/mmzone.h | 17 +++++++++++++++
> > mm/hugetlb_vmemmap.c | 28 ++++++++++++++++++-------
> > mm/memory_hotplug.c | 22 +++++++------------
> > mm/sparse.c | 8 +++++++
> > 7 files changed, 66 insertions(+), 45 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index c087f578d9d8..5359ffb04a84 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -1730,9 +1730,11 @@
> > Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
> > the default is on.
> >
> > - This is not compatible with memory_hotplug.memmap_on_memory.
> > - If both parameters are enabled, hugetlb_free_vmemmap takes
> > - precedence over memory_hotplug.memmap_on_memory.
> > + Note that the vmemmap pages may be allocated from the added
> > + memory block itself when memory_hotplug.memmap_on_memory is
> > + enabled, those vmemmap pages cannot be optimized even if this
> > + feature is enabled. Other vmemmap pages not allocated from
> > + the added memory block itself do not be affected.
> >
> > hung_task_panic=
> > [KNL] Should the hung task detector generate panics.
> > @@ -3077,10 +3079,12 @@
> > [KNL,X86,ARM] Boolean flag to enable this feature.
> > Format: {on | off (default)}
> > When enabled, runtime hotplugged memory will
> > - allocate its internal metadata (struct pages)
> > - from the hotadded memory which will allow to
> > - hotadd a lot of memory without requiring
> > - additional memory to do so.
> > + allocate its internal metadata (struct pages,
> > + those vmemmap pages cannot be optimized even
> > + if hugetlb_free_vmemmap is enabled) from the
> > + hotadded memory which will allow to hotadd a
> > + lot of memory without requiring additional
> > + memory to do so.
> > This feature is disabled by default because it
> > has some implication on large (e.g. GB)
> > allocations in some configurations (e.g. small
> > @@ -3090,10 +3094,6 @@
> > Note that even when enabled, there are a few cases where
> > the feature is not effective.
> >
> > - This is not compatible with hugetlb_free_vmemmap. If
> > - both parameters are enabled, hugetlb_free_vmemmap takes
> > - precedence over memory_hotplug.memmap_on_memory.
> > -
> > memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
> > Format: <integer>
> > default : 0 <disable>
> > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > index 5c9aa171a0d3..d7374a1e8ac9 100644
> > --- a/Documentation/admin-guide/sysctl/vm.rst
> > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > @@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
> > hugetlb_optimize_vmemmap
> > ========================
> >
> > -This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
> > -is configured or the size of 'struct page' (a structure defined in
> > -include/linux/mm_types.h) is not power of two (an unusual system config could
> > +This knob is not available when the size of 'struct page' (a structure defined
> > +in include/linux/mm_types.h) is not power of two (an unusual system config could
> > result in this).
> >
> > Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
> > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> > index 20d7edf62a6a..e0b2209ab71c 100644
> > --- a/include/linux/memory_hotplug.h
> > +++ b/include/linux/memory_hotplug.h
> > @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
> > extern bool mhp_supports_memmap_on_memory(unsigned long size);
> > #endif /* CONFIG_MEMORY_HOTPLUG */
> >
> > -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> > -bool mhp_memmap_on_memory(void);
> > -#else
> > -static inline bool mhp_memmap_on_memory(void)
> > -{
> > - return false;
> > -}
> > -#endif
> > -
> > #endif /* __LINUX_MEMORY_HOTPLUG_H */
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 2cf2a76535ab..607a4fcabbd4 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1434,6 +1434,7 @@ extern size_t mem_section_usage_size(void);
> > MAPPER(IS_ONLINE) \
> > MAPPER(IS_EARLY) \
> > MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
> > + MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) \
> > MAPPER(MAP_LAST_BIT)
> >
> > #define __SECTION_SHIFT_FLAG_MAPPER_0(x)
> > @@ -1471,6 +1472,22 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
> > return (struct page *)map;
> > }
> >
> > +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> > +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > + ms->section_mem_map |= SECTION_CANNOT_OPTIMIZE_VMEMMAP;
> > +}
> > +
> > +static inline int section_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > + return (ms && (ms->section_mem_map & SECTION_CANNOT_OPTIMIZE_VMEMMAP));
> > +}
> > +#else
> > +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > +}
> > +#endif
> > +
> > static inline int present_section(struct mem_section *section)
> > {
> > return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index fcd9f7872064..f12170520337 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -97,18 +97,32 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
> > return ret;
> > }
> >
> > +static unsigned int optimizable_vmemmap_pages(struct hstate *h,
> > + struct page *head)
> > +{
> > + unsigned long pfn = page_to_pfn(head);
> > + unsigned long end = pfn + pages_per_huge_page(h);
> > +
> > + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> > + return 0;
> > +
> > + for (; pfn < end; pfn += PAGES_PER_SECTION) {
> > + if (section_cannot_optimize_vmemmap(__pfn_to_section(pfn)))
> > + return 0;
> > + }
> > +
> > + return hugetlb_optimize_vmemmap_pages(h);
> > +}
> > +
> > void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> > {
> > unsigned long vmemmap_addr = (unsigned long)head;
> > unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> >
> > - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> > + vmemmap_pages = optimizable_vmemmap_pages(h, head);
> > if (!vmemmap_pages)
> > return;
> >
> > - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> > - return;
> > -
> > static_branch_inc(&hugetlb_optimize_vmemmap_key);
> >
> > vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> > @@ -199,10 +213,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> > static __init int hugetlb_vmemmap_sysctls_init(void)
> > {
> > /*
> > - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> > - * crosses page boundaries, the vmemmap pages cannot be optimized.
> > + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> > + * be optimized.
> > */
> > - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> > + if (is_power_of_2(sizeof(struct page)))
> > register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
> >
> > return 0;
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index 3b360eda933f..7309694c4dee 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -43,30 +43,22 @@
> > #include "shuffle.h"
> >
> > #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> > -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> > -{
> > - if (hugetlb_optimize_vmemmap_enabled())
> > - return 0;
> > - return param_set_bool(val, kp);
> > -}
> > -
> > -static const struct kernel_param_ops memmap_on_memory_ops = {
> > - .flags = KERNEL_PARAM_OPS_FL_NOARG,
> > - .set = memmap_on_memory_set,
> > - .get = param_get_bool,
> > -};
> > -
> > /*
> > * memory_hotplug.memmap_on_memory parameter
> > */
> > static bool memmap_on_memory __ro_after_init;
> > -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
> > +module_param(memmap_on_memory, bool, 0444);
> > MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
> >
> > -bool mhp_memmap_on_memory(void)
> > +static inline bool mhp_memmap_on_memory(void)
> > {
> > return memmap_on_memory;
> > }
> > +#else
> > +static inline bool mhp_memmap_on_memory(void)
> > +{
> > + return false;
> > +}
> > #endif
> >
> > enum {
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index cb3bfae64036..1f353bf9ea6b 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -913,6 +913,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
> > ms = __nr_to_section(section_nr);
> > set_section_nid(section_nr, nid);
> > __section_mark_present(ms, section_nr);
> > + /*
> > + * Mark whole section as non-optimizable once there is a subsection
> > + * whose vmemmap pages are allocated from alternative allocator. The
> > + * early section is always optimizable since the early section's
> > + * vmemmap pages do not consider partially being populated.
> > + */
> > + if (!early_section(ms) && altmap)
> > + section_mark_cannot_optimize_vmemmap(ms);
> >
> > /* Align memmap to section boundary in the subsection case */
> > if (section_nr_to_pfn(section_nr) != start_pfn)
>
>
> --
> Thanks,
>
> David / dhildenb
>
>
On Wed, Jun 15, 2022 at 11:51:49AM +0200, David Hildenbrand wrote:
> An easier/future-proof approach might simply be flagging the vmemmap
> pages as being special. We reuse page flags for that, which don't have
> semantics yet (i.e., PG_reserved indicates a boot-time allocation via
> memblock).
The first versions of memmap_on_memory [1] introduced a new page type
to represent such pages, not sure if that is what you mean, e.g:
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f91cb8898ff0..75f302a532f9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -708,6 +708,7 @@ PAGEFLAG_FALSE(DoubleMap)
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
#define PG_guard 0x00000800
+#define PG_vmemmap 0x00001000
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -764,6 +765,24 @@ PAGE_TYPE_OPS(Table, table)
*/
PAGE_TYPE_OPS(Guard, guard)
+/*
+ * Vmemmap pages refers to those pages that are used to create the memmap
+ * array, and reside within the same memory range that was hotppluged, so
+ * they are self-hosted. (see include/linux/memory_hotplug.h)
+ */
+PAGE_TYPE_OPS(Vmemmap, vmemmap)
+static __always_inline void SetPageVmemmap(struct page *page)
+{
+ __SetPageVmemmap(page);
+ __SetPageReserved(page);
+}
+
+static __always_inline void ClearPageVmemmap(struct page *page)
+{
+ __ClearPageVmemmap(page);
+ __ClearPageReserved(page);
+}
[1] https://patchwork.kernel.org/project/linux-mm/patch/[email protected]/
>
> You'd walk the applicable vmemmap pages you want to optimize and check
> if they are marked as special. You don't have to walk all but can
> optimize: memmap_on_memory uses a vmemmap size that's at least PMD_SIZE.
> So it's sufficient to check a single vmemmap page inside a PMD_SIZE
> vmemmap range.
>
>
> > Signed-off-by: Muchun Song <[email protected]>
> > ---
> > Documentation/admin-guide/kernel-parameters.txt | 22 +++++++++----------
> > Documentation/admin-guide/sysctl/vm.rst | 5 ++---
> > include/linux/memory_hotplug.h | 9 --------
> > include/linux/mmzone.h | 17 +++++++++++++++
> > mm/hugetlb_vmemmap.c | 28 ++++++++++++++++++-------
> > mm/memory_hotplug.c | 22 +++++++------------
> > mm/sparse.c | 8 +++++++
> > 7 files changed, 66 insertions(+), 45 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index c087f578d9d8..5359ffb04a84 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -1730,9 +1730,11 @@
> > Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
> > the default is on.
> >
> > - This is not compatible with memory_hotplug.memmap_on_memory.
> > - If both parameters are enabled, hugetlb_free_vmemmap takes
> > - precedence over memory_hotplug.memmap_on_memory.
> > + Note that the vmemmap pages may be allocated from the added
> > + memory block itself when memory_hotplug.memmap_on_memory is
> > + enabled, those vmemmap pages cannot be optimized even if this
> > + feature is enabled. Other vmemmap pages not allocated from
> > + the added memory block itself do not be affected.
> >
> > hung_task_panic=
> > [KNL] Should the hung task detector generate panics.
> > @@ -3077,10 +3079,12 @@
> > [KNL,X86,ARM] Boolean flag to enable this feature.
> > Format: {on | off (default)}
> > When enabled, runtime hotplugged memory will
> > - allocate its internal metadata (struct pages)
> > - from the hotadded memory which will allow to
> > - hotadd a lot of memory without requiring
> > - additional memory to do so.
> > + allocate its internal metadata (struct pages,
> > + those vmemmap pages cannot be optimized even
> > + if hugetlb_free_vmemmap is enabled) from the
> > + hotadded memory which will allow to hotadd a
> > + lot of memory without requiring additional
> > + memory to do so.
> > This feature is disabled by default because it
> > has some implication on large (e.g. GB)
> > allocations in some configurations (e.g. small
> > @@ -3090,10 +3094,6 @@
> > Note that even when enabled, there are a few cases where
> > the feature is not effective.
> >
> > - This is not compatible with hugetlb_free_vmemmap. If
> > - both parameters are enabled, hugetlb_free_vmemmap takes
> > - precedence over memory_hotplug.memmap_on_memory.
> > -
> > memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
> > Format: <integer>
> > default : 0 <disable>
> > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > index 5c9aa171a0d3..d7374a1e8ac9 100644
> > --- a/Documentation/admin-guide/sysctl/vm.rst
> > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > @@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
> > hugetlb_optimize_vmemmap
> > ========================
> >
> > -This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
> > -is configured or the size of 'struct page' (a structure defined in
> > -include/linux/mm_types.h) is not power of two (an unusual system config could
> > +This knob is not available when the size of 'struct page' (a structure defined
> > +in include/linux/mm_types.h) is not power of two (an unusual system config could
> > result in this).
> >
> > Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
> > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> > index 20d7edf62a6a..e0b2209ab71c 100644
> > --- a/include/linux/memory_hotplug.h
> > +++ b/include/linux/memory_hotplug.h
> > @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
> > extern bool mhp_supports_memmap_on_memory(unsigned long size);
> > #endif /* CONFIG_MEMORY_HOTPLUG */
> >
> > -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> > -bool mhp_memmap_on_memory(void);
> > -#else
> > -static inline bool mhp_memmap_on_memory(void)
> > -{
> > - return false;
> > -}
> > -#endif
> > -
> > #endif /* __LINUX_MEMORY_HOTPLUG_H */
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 2cf2a76535ab..607a4fcabbd4 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -1434,6 +1434,7 @@ extern size_t mem_section_usage_size(void);
> > MAPPER(IS_ONLINE) \
> > MAPPER(IS_EARLY) \
> > MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \
> > + MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) \
> > MAPPER(MAP_LAST_BIT)
> >
> > #define __SECTION_SHIFT_FLAG_MAPPER_0(x)
> > @@ -1471,6 +1472,22 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
> > return (struct page *)map;
> > }
> >
> > +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> > +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > + ms->section_mem_map |= SECTION_CANNOT_OPTIMIZE_VMEMMAP;
> > +}
> > +
> > +static inline int section_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > + return (ms && (ms->section_mem_map & SECTION_CANNOT_OPTIMIZE_VMEMMAP));
> > +}
> > +#else
> > +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms)
> > +{
> > +}
> > +#endif
> > +
> > static inline int present_section(struct mem_section *section)
> > {
> > return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index fcd9f7872064..f12170520337 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -97,18 +97,32 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
> > return ret;
> > }
> >
> > +static unsigned int optimizable_vmemmap_pages(struct hstate *h,
> > + struct page *head)
> > +{
> > + unsigned long pfn = page_to_pfn(head);
> > + unsigned long end = pfn + pages_per_huge_page(h);
> > +
> > + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> > + return 0;
> > +
> > + for (; pfn < end; pfn += PAGES_PER_SECTION) {
> > + if (section_cannot_optimize_vmemmap(__pfn_to_section(pfn)))
> > + return 0;
> > + }
> > +
> > + return hugetlb_optimize_vmemmap_pages(h);
> > +}
> > +
> > void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> > {
> > unsigned long vmemmap_addr = (unsigned long)head;
> > unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> >
> > - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> > + vmemmap_pages = optimizable_vmemmap_pages(h, head);
> > if (!vmemmap_pages)
> > return;
> >
> > - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
> > - return;
> > -
> > static_branch_inc(&hugetlb_optimize_vmemmap_key);
> >
> > vmemmap_addr += RESERVE_VMEMMAP_SIZE;
> > @@ -199,10 +213,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> > static __init int hugetlb_vmemmap_sysctls_init(void)
> > {
> > /*
> > - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> > - * crosses page boundaries, the vmemmap pages cannot be optimized.
> > + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> > + * be optimized.
> > */
> > - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> > + if (is_power_of_2(sizeof(struct page)))
> > register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
> >
> > return 0;
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index 3b360eda933f..7309694c4dee 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -43,30 +43,22 @@
> > #include "shuffle.h"
> >
> > #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> > -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> > -{
> > - if (hugetlb_optimize_vmemmap_enabled())
> > - return 0;
> > - return param_set_bool(val, kp);
> > -}
> > -
> > -static const struct kernel_param_ops memmap_on_memory_ops = {
> > - .flags = KERNEL_PARAM_OPS_FL_NOARG,
> > - .set = memmap_on_memory_set,
> > - .get = param_get_bool,
> > -};
> > -
> > /*
> > * memory_hotplug.memmap_on_memory parameter
> > */
> > static bool memmap_on_memory __ro_after_init;
> > -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
> > +module_param(memmap_on_memory, bool, 0444);
> > MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
> >
> > -bool mhp_memmap_on_memory(void)
> > +static inline bool mhp_memmap_on_memory(void)
> > {
> > return memmap_on_memory;
> > }
> > +#else
> > +static inline bool mhp_memmap_on_memory(void)
> > +{
> > + return false;
> > +}
> > #endif
> >
> > enum {
> > diff --git a/mm/sparse.c b/mm/sparse.c
> > index cb3bfae64036..1f353bf9ea6b 100644
> > --- a/mm/sparse.c
> > +++ b/mm/sparse.c
> > @@ -913,6 +913,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
> > ms = __nr_to_section(section_nr);
> > set_section_nid(section_nr, nid);
> > __section_mark_present(ms, section_nr);
> > + /*
> > + * Mark whole section as non-optimizable once there is a subsection
> > + * whose vmemmap pages are allocated from alternative allocator. The
> > + * early section is always optimizable since the early section's
> > + * vmemmap pages do not consider partially being populated.
> > + */
> > + if (!early_section(ms) && altmap)
> > + section_mark_cannot_optimize_vmemmap(ms);
> >
> > /* Align memmap to section boundary in the subsection case */
> > if (section_nr_to_pfn(section_nr) != start_pfn)
>
>
> --
> Thanks,
>
> David / dhildenb
>
>
--
Oscar Salvador
SUSE Labs
On 16.06.22 04:45, Muchun Song wrote:
> On Wed, Jun 15, 2022 at 11:51:49AM +0200, David Hildenbrand wrote:
>> On 20.05.22 04:55, Muchun Song wrote:
>>> For now, the feature of hugetlb_free_vmemmap is not compatible with the
>>> feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap
>>> takes precedence over memory_hotplug.memmap_on_memory. However, someone
>>> wants to make memory_hotplug.memmap_on_memory takes precedence over
>>> hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
>>> succeed memory hotplug in close-to-OOM situations. So the decision
>>> of making hugetlb_free_vmemmap take precedence is not wise and elegant.
>>> The proper approach is to have hugetlb_vmemmap.c do the check whether
>>> the section which the HugeTLB pages belong to can be optimized. If
>>> the section's vmemmap pages are allocated from the added memory block
>>> itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap,
>>> otherwise, do the optimization. Then both kernel parameters are
>>> compatible. So this patch introduces SECTION_CANNOT_OPTIMIZE_VMEMMAP
>>> to indicate whether the section could be optimized.
>>>
>>
>> In theory, we have that information stored in the relevant memory block,
>> but I assume that lookup in the xarray + locking is impractical.
>>
>> I wonder if we can derive that information simply from the vmemmap pages
>> themselves, because *drumroll*
>>
>> For one vmemmap page (the first one), the vmemmap corresponds to itself
>> -- what?!
>>
>>
>> [ hotplugged memory ]
>> [ memmap ][ usable memory ]
>> | | |
>> ^--- | |
>> ^------- |
>> ^----------------------
>>
>> The memmap of the first page of hotplugged memory falls onto itself.
>> We'd have to derive from actual "usable memory" that condition.
>>
>>
>> We currently support memmap_on_memory memory only within fixed-size
>> memory blocks. So "hotplugged memory" is guaranteed to be aligned to
>> memory_block_size_bytes() and the size is memory_block_size_bytes().
>>
>> If we'd have a page falling into usbale memory, we'd simply lookup the
>> first page and test if the vmemmap maps to itself.
>>
>
> I think this can work. Should we use this approach in next version?
>
Either that or more preferable, flagging the vmemmap pages eventually.
That's might be future proof.
>>
>> Of course, once we'd support variable-sized memory blocks, it would be
>> different.
>>
>>
>> An easier/future-proof approach might simply be flagging the vmemmap
>> pages as being special. We reuse page flags for that, which don't have
>> semantics yet (i.e., PG_reserved indicates a boot-time allocation via
>> memblock).
>>
>
> I think you mean flag vmemmap pages' struct page as PG_reserved if it
> can be optimized, right? When the vmemmap pages are allocated in
> hugetlb_vmemmap_alloc(), is it valid to flag them as PG_reserved (they
> are allocated from buddy allocator not memblock)?
>
Sorry I wasn't clear. I'd flag them with some other
not-yet-used-for-vmemmap-pages flag. Reusing PG_reserved could result in
trouble.
--
Thanks,
David / dhildenb
On 16.06.22 05:57, Oscar Salvador wrote:
> On Wed, Jun 15, 2022 at 11:51:49AM +0200, David Hildenbrand wrote:
>> An easier/future-proof approach might simply be flagging the vmemmap
>> pages as being special. We reuse page flags for that, which don't have
>> semantics yet (i.e., PG_reserved indicates a boot-time allocation via
>> memblock).
>
> The first versions of memmap_on_memory [1] introduced a new page type
> to represent such pages, not sure if that is what you mean, e.g:
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index f91cb8898ff0..75f302a532f9 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -708,6 +708,7 @@ PAGEFLAG_FALSE(DoubleMap)
> #define PG_kmemcg 0x00000200
> #define PG_table 0x00000400
> #define PG_guard 0x00000800
> +#define PG_vmemmap 0x00001000
>
> #define PageType(page, flag) \
> ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
> @@ -764,6 +765,24 @@ PAGE_TYPE_OPS(Table, table)
> */
> PAGE_TYPE_OPS(Guard, guard)
>
> +/*
> + * Vmemmap pages refers to those pages that are used to create the memmap
> + * array, and reside within the same memory range that was hotppluged, so
> + * they are self-hosted. (see include/linux/memory_hotplug.h)
> + */
> +PAGE_TYPE_OPS(Vmemmap, vmemmap)
> +static __always_inline void SetPageVmemmap(struct page *page)
> +{
> + __SetPageVmemmap(page);
> + __SetPageReserved(page);
> +}
> +
> +static __always_inline void ClearPageVmemmap(struct page *page)
> +{
> + __ClearPageVmemmap(page);
> + __ClearPageReserved(page);
> +}
>
> [1] https://patchwork.kernel.org/project/linux-mm/patch/[email protected]/
IIRC, that was used to skip these patches on the offlining path before
we provided the ranges to offline_pages().
I'd not mess with PG_reserved, and give them a clearer name, to not
confuse them with other, ordinary, vmemmap pages that are not
self-hosted (maybe in the future we might want to flag all vmemmap pages
with a new type?).
I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
(v)memmap pages with a type PG_memmap. However, the latter would be
optional and might not be strictly required
So what think could make sense is
/* vmemmap pages that are self-hosted and cannot be optimized/freed. */
PG_vmemmap_self_hosted = PG_owner_priv_1,
--
Thanks,
David / dhildenb
On Thu, Jun 16, 2022 at 09:21:35AM +0200, David Hildenbrand wrote:
> On 16.06.22 04:45, Muchun Song wrote:
> > On Wed, Jun 15, 2022 at 11:51:49AM +0200, David Hildenbrand wrote:
> >> On 20.05.22 04:55, Muchun Song wrote:
> >>> For now, the feature of hugetlb_free_vmemmap is not compatible with the
> >>> feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap
> >>> takes precedence over memory_hotplug.memmap_on_memory. However, someone
> >>> wants to make memory_hotplug.memmap_on_memory takes precedence over
> >>> hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
> >>> succeed memory hotplug in close-to-OOM situations. So the decision
> >>> of making hugetlb_free_vmemmap take precedence is not wise and elegant.
> >>> The proper approach is to have hugetlb_vmemmap.c do the check whether
> >>> the section which the HugeTLB pages belong to can be optimized. If
> >>> the section's vmemmap pages are allocated from the added memory block
> >>> itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap,
> >>> otherwise, do the optimization. Then both kernel parameters are
> >>> compatible. So this patch introduces SECTION_CANNOT_OPTIMIZE_VMEMMAP
> >>> to indicate whether the section could be optimized.
> >>>
> >>
> >> In theory, we have that information stored in the relevant memory block,
> >> but I assume that lookup in the xarray + locking is impractical.
> >>
> >> I wonder if we can derive that information simply from the vmemmap pages
> >> themselves, because *drumroll*
> >>
> >> For one vmemmap page (the first one), the vmemmap corresponds to itself
> >> -- what?!
> >>
> >>
> >> [ hotplugged memory ]
> >> [ memmap ][ usable memory ]
> >> | | |
> >> ^--- | |
> >> ^------- |
> >> ^----------------------
> >>
> >> The memmap of the first page of hotplugged memory falls onto itself.
> >> We'd have to derive from actual "usable memory" that condition.
> >>
> >>
> >> We currently support memmap_on_memory memory only within fixed-size
> >> memory blocks. So "hotplugged memory" is guaranteed to be aligned to
> >> memory_block_size_bytes() and the size is memory_block_size_bytes().
> >>
> >> If we'd have a page falling into usbale memory, we'd simply lookup the
> >> first page and test if the vmemmap maps to itself.
> >>
> >
> > I think this can work. Should we use this approach in next version?
> >
>
> Either that or more preferable, flagging the vmemmap pages eventually.
> That's might be future proof.
>
All right. I think we can go with the above approach, we can improve it
to flagging-base approach in the future if needed.
> >>
> >> Of course, once we'd support variable-sized memory blocks, it would be
> >> different.
> >>
> >>
> >> An easier/future-proof approach might simply be flagging the vmemmap
> >> pages as being special. We reuse page flags for that, which don't have
> >> semantics yet (i.e., PG_reserved indicates a boot-time allocation via
> >> memblock).
> >>
> >
> > I think you mean flag vmemmap pages' struct page as PG_reserved if it
> > can be optimized, right? When the vmemmap pages are allocated in
> > hugetlb_vmemmap_alloc(), is it valid to flag them as PG_reserved (they
> > are allocated from buddy allocator not memblock)?
> >
>
> Sorry I wasn't clear. I'd flag them with some other
> not-yet-used-for-vmemmap-pages flag. Reusing PG_reserved could result in
> trouble.
>
Sorry. I thought you suggest reusing "PG_reserved". My bad misreading.
Thanks.
On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
> IIRC, that was used to skip these patches on the offlining path before
> we provided the ranges to offline_pages().
Yeah, it was designed for that purpose back then.
> I'd not mess with PG_reserved, and give them a clearer name, to not
> confuse them with other, ordinary, vmemmap pages that are not
> self-hosted (maybe in the future we might want to flag all vmemmap pages
> with a new type?).
Not sure whether a new type is really needed, or to put it another way, I
cannot see the benefit.
>
> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
> (v)memmap pages with a type PG_memmap. However, the latter would be
> optional and might not be strictly required
>
>
> So what think could make sense is
>
> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
> PG_vmemmap_self_hosted = PG_owner_priv_1,
Sure, I just lightly tested the below, and seems to work, but not sure
whether that is what you are referring to.
@Munchun: thoughts?
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa3191d..a4556afd7bda 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -193,6 +193,11 @@ enum pageflags {
/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* For self-hosted memmap pages */
+ PG_vmemmap_self_hosted = PG_owner_priv_1,
+#endif
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
*/
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
+#ifdef CONFIG_MEMORY_HOTPLUG
+PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
+#endif
+
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1089ea8a9c98..e2de7ed27e9e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+ struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
+ struct page *memmap;
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map,
+ pfn_to_section_nr(page_to_pfn(head)));
+
+ if (PageVmemmap_self_hosted(memmap))
+ return;
vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
if (!vmemmap_pages)
@@ -199,10 +207,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static __init int hugetlb_vmemmap_sysctls_init(void)
{
/*
- * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
- * crosses page boundaries, the vmemmap pages cannot be optimized.
+ * If "struct page" crosses page boundaries, the vmemmap pages cannot
+ * be optimized.
*/
- if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ if (is_power_of_2(sizeof(struct page)))
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1213d0c67a53..863966c2c6f1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -45,8 +45,6 @@
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
{
- if (hugetlb_optimize_vmemmap_enabled())
- return 0;
return param_set_bool(val, kp);
}
@@ -1032,6 +1030,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
{
unsigned long end_pfn = pfn + nr_pages;
int ret;
+ int i;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
if (ret)
@@ -1039,6 +1038,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ /*
+ * Let us flag self-hosted memmap
+ */
+ for (i = 0; i < nr_pages; i++)
+ SetPageVmemmap_self_hosted(pfn_to_page(pfn + i));
+
/*
* It might be that the vmemmap_pages fully span sections. If that is
* the case, mark those sections online here as otherwise they will be
--
Oscar Salvador
SUSE Labs
On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
> > IIRC, that was used to skip these patches on the offlining path before
> > we provided the ranges to offline_pages().
>
> Yeah, it was designed for that purpose back then.
>
> > I'd not mess with PG_reserved, and give them a clearer name, to not
> > confuse them with other, ordinary, vmemmap pages that are not
> > self-hosted (maybe in the future we might want to flag all vmemmap pages
> > with a new type?).
>
> Not sure whether a new type is really needed, or to put it another way, I
> cannot see the benefit.
>
> >
> > I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
> > (v)memmap pages with a type PG_memmap. However, the latter would be
> > optional and might not be strictly required
> >
> >
> > So what think could make sense is
> >
> > /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
> > PG_vmemmap_self_hosted = PG_owner_priv_1,
>
> Sure, I just lightly tested the below, and seems to work, but not sure
> whether that is what you are referring to.
> @Munchun: thoughts?
>
I think it works and fits my requirement.
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e66f7aa3191d..a4556afd7bda 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -193,6 +193,11 @@ enum pageflags {
>
> /* Only valid for buddy pages. Used to track pages that are reported */
> PG_reported = PG_uptodate,
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> + /* For self-hosted memmap pages */
> + PG_vmemmap_self_hosted = PG_owner_priv_1,
> +#endif
> };
>
> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
> */
> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
> +#endif
> +
> /*
> * On an anonymous page mapped into a user virtual memory area,
> * page->mapping points to its anon_vma, not to a struct address_space;
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 1089ea8a9c98..e2de7ed27e9e 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> {
> unsigned long vmemmap_addr = (unsigned long)head;
> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> + struct page *memmap;
> +
> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> + pfn_to_section_nr(page_to_pfn(head)));
> +
> + if (PageVmemmap_self_hosted(memmap))
> + return;
I think here needs a loop if it is a 1GB page (spans multiple sections).
Right? Here is an implementation based on another approach. But I think
your implementation is more simpler and efficient. Would you mind me
squash your diff into my patch and with your "Co-developed-by"?
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index fcd9f7872064..46d637acc15e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,7 +10,7 @@
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
-#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
#include "hugetlb_vmemmap.h"
/*
@@ -97,18 +97,79 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
return ret;
}
+/*
+ * The vmemmap of the first page of hotplugged memory falls onto itself when
+ * memory_hotplug.memmap_on_memory is enabled, and the vmemmap pages cannot be
+ * optimized in this case. We can simply lookup the first page and test if
+ * the vmemmap maps to itself to detect if memory_hotplug.memmap_on_memory is
+ * enabled for this memory block.
+ *
+ * [ hotplugged memory ]
+ * [ vmemmap ][ usable memory ]
+ * ^ | | |
+ * +---+ | |
+ * ^ | |
+ * +--------+ |
+ * ^ |
+ * +-----------------+
+ */
+static bool memory_block_vmemmap_optimizable(unsigned long start_pfn)
+{
+ pmd_t *pmdp, pmd;
+ unsigned long pfn, vaddr;
+
+ vaddr = (unsigned long)pfn_to_page(start_pfn);
+ pmdp = pmd_off_k(vaddr);
+ /*
+ * The READ_ONCE() is used to stabilize *pmdp in a register or on
+ * the stack so that it will stop changing under the code.
+ */
+ pmd = READ_ONCE(*pmdp);
+
+ if (pmd_large(pmd))
+ pfn = pmd_pfn(pmd);
+ else
+ pfn = pte_pfn(*pte_offset_kernel(pmdp, vaddr));
+
+ return pfn != start_pfn;
+}
+
+static unsigned int optimizable_vmemmap_pages(struct hstate *h,
+ struct page *head)
+{
+ unsigned long size = memory_block_size_bytes();
+ unsigned long pfn = page_to_pfn(head);
+ unsigned long start = ALIGN_DOWN(pfn, size);
+ unsigned long end = start + pages_per_huge_page(h);
+
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+ return 0;
+
+ for (; start < end; start += size) {
+ /*
+ * Fast path. The early section is always optimizable since the
+ * early section's vmemmap pages do not allocated from the added
+ * memory block itself.
+ */
+ if (early_section(__pfn_to_section(start + (pfn & PAGE_SECTION_MASK))))
+ continue;
+
+ if (!memory_block_vmemmap_optimizable(start))
+ return 0;
+ }
+
+ return hugetlb_optimize_vmemmap_pages(h);
+}
+
void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ vmemmap_pages = optimizable_vmemmap_pages(h, head);
if (!vmemmap_pages)
return;
- if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
- return;
-
static_branch_inc(&hugetlb_optimize_vmemmap_key);
vmemmap_addr += RESERVE_VMEMMAP_SIZE;
@@ -199,10 +260,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static __init int hugetlb_vmemmap_sysctls_init(void)
{
/*
- * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
- * crosses page boundaries, the vmemmap pages cannot be optimized.
+ * If "struct page" crosses page boundaries, the vmemmap pages cannot
+ * be optimized.
*/
- if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ if (is_power_of_2(sizeof(struct page)))
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;
>
> vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> if (!vmemmap_pages)
> @@ -199,10 +207,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> static __init int hugetlb_vmemmap_sysctls_init(void)
> {
> /*
> - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> - * crosses page boundaries, the vmemmap pages cannot be optimized.
> + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> + * be optimized.
> */
> - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> + if (is_power_of_2(sizeof(struct page)))
> register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
>
> return 0;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 1213d0c67a53..863966c2c6f1 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -45,8 +45,6 @@
> #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> {
> - if (hugetlb_optimize_vmemmap_enabled())
> - return 0;
> return param_set_bool(val, kp);
> }
>
> @@ -1032,6 +1030,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
> {
> unsigned long end_pfn = pfn + nr_pages;
> int ret;
> + int i;
>
> ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
> if (ret)
> @@ -1039,6 +1038,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
>
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
>
> + /*
> + * Let us flag self-hosted memmap
> + */
> + for (i = 0; i < nr_pages; i++)
> + SetPageVmemmap_self_hosted(pfn_to_page(pfn + i));
> +
> /*
> * It might be that the vmemmap_pages fully span sections. If that is
> * the case, mark those sections online here as otherwise they will be
>
>
> --
> Oscar Salvador
> SUSE Labs
>
On 17.06.22 09:28, Muchun Song wrote:
> On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
>> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
>>> IIRC, that was used to skip these patches on the offlining path before
>>> we provided the ranges to offline_pages().
>>
>> Yeah, it was designed for that purpose back then.
>>
>>> I'd not mess with PG_reserved, and give them a clearer name, to not
>>> confuse them with other, ordinary, vmemmap pages that are not
>>> self-hosted (maybe in the future we might want to flag all vmemmap pages
>>> with a new type?).
>>
>> Not sure whether a new type is really needed, or to put it another way, I
>> cannot see the benefit.
>>
>>>
>>> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
>>> (v)memmap pages with a type PG_memmap. However, the latter would be
>>> optional and might not be strictly required
>>>
>>>
>>> So what think could make sense is
>>>
>>> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
>>> PG_vmemmap_self_hosted = PG_owner_priv_1,
>>
>> Sure, I just lightly tested the below, and seems to work, but not sure
>> whether that is what you are referring to.
>> @Munchun: thoughts?
>>
>
> I think it works and fits my requirement.
>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index e66f7aa3191d..a4556afd7bda 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -193,6 +193,11 @@ enum pageflags {
>>
>> /* Only valid for buddy pages. Used to track pages that are reported */
>> PG_reported = PG_uptodate,
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> + /* For self-hosted memmap pages */
>> + PG_vmemmap_self_hosted = PG_owner_priv_1,
>> +#endif
>> };
>>
>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
>> */
>> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>>
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
>> +#endif
>> +
>> /*
>> * On an anonymous page mapped into a user virtual memory area,
>> * page->mapping points to its anon_vma, not to a struct address_space;
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index 1089ea8a9c98..e2de7ed27e9e 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
>> {
>> unsigned long vmemmap_addr = (unsigned long)head;
>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
>> + struct page *memmap;
>> +
>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
>> + pfn_to_section_nr(page_to_pfn(head)));
>> +
>> + if (PageVmemmap_self_hosted(memmap))
>> + return;
>
> I think here needs a loop if it is a 1GB page (spans multiple sections).
> Right? Here is an implementation based on another approach. But I think
> your implementation is more simpler and efficient. Would you mind me
> squash your diff into my patch and with your "Co-developed-by"?
Due to hugtlb alignment requirements, and the vmemmap pages being at the
start of the hotplugged memory region, I think that cannot currently
happen. Checking the first vmemmap page might be good enough for now,
and probably for the future.
--
Thanks,
David / dhildenb
On 17.06.22 07:46, Oscar Salvador wrote:
> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
>> IIRC, that was used to skip these patches on the offlining path before
>> we provided the ranges to offline_pages().
>
> Yeah, it was designed for that purpose back then.
>
>> I'd not mess with PG_reserved, and give them a clearer name, to not
>> confuse them with other, ordinary, vmemmap pages that are not
>> self-hosted (maybe in the future we might want to flag all vmemmap pages
>> with a new type?).
>
> Not sure whether a new type is really needed, or to put it another way, I
> cannot see the benefit.
>
>>
>> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
>> (v)memmap pages with a type PG_memmap. However, the latter would be
>> optional and might not be strictly required
>>
>>
>> So what think could make sense is
>>
>> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
>> PG_vmemmap_self_hosted = PG_owner_priv_1,
>
> Sure, I just lightly tested the below, and seems to work, but not sure
> whether that is what you are referring to.
> @Munchun: thoughts?
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e66f7aa3191d..a4556afd7bda 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -193,6 +193,11 @@ enum pageflags {
>
> /* Only valid for buddy pages. Used to track pages that are reported */
> PG_reported = PG_uptodate,
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> + /* For self-hosted memmap pages */
> + PG_vmemmap_self_hosted = PG_owner_priv_1,
> +#endif
> };
>
> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
> */
> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
VmemmapSelfHosted, then the function names get nicer.
> +#endif
> +
> /*
> * On an anonymous page mapped into a user virtual memory area,
> * page->mapping points to its anon_vma, not to a struct address_space;
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 1089ea8a9c98..e2de7ed27e9e 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> {
> unsigned long vmemmap_addr = (unsigned long)head;
> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> + struct page *memmap;
> +
> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> + pfn_to_section_nr(page_to_pfn(head)));
Why can't we check the head page directly? Either I need more coffee or
this can be simplified.
> +
> + if (PageVmemmap_self_hosted(memmap))
Maybe that's the right place for a comment, an ascii art, and how it is
safe to only check the first vmemmap page due to alignment restrictions.
> + return;
>
> vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> if (!vmemmap_pages)
> @@ -199,10 +207,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> static __init int hugetlb_vmemmap_sysctls_init(void)
> {
> /*
> - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> - * crosses page boundaries, the vmemmap pages cannot be optimized.
> + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> + * be optimized.
> */
> - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> + if (is_power_of_2(sizeof(struct page)))
> register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
>
> return 0;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 1213d0c67a53..863966c2c6f1 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -45,8 +45,6 @@
> #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> {
> - if (hugetlb_optimize_vmemmap_enabled())
> - return 0;
> return param_set_bool(val, kp);
> }
>
> @@ -1032,6 +1030,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
> {
> unsigned long end_pfn = pfn + nr_pages;
> int ret;
> + int i;
>
> ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
> if (ret)
> @@ -1039,6 +1038,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
>
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
>
> + /*
> + * Let us flag self-hosted memmap
> + */
I think that comment can be dropped because the code does exactly that.
> + for (i = 0; i < nr_pages; i++)
> + SetPageVmemmap_self_hosted(pfn_to_page(pfn + i));
> +
> /*
> * It might be that the vmemmap_pages fully span sections. If that is
> * the case, mark those sections online here as otherwise they will be
>
>
--
Thanks,
David / dhildenb
On Fri, Jun 17, 2022 at 09:39:27AM +0200, David Hildenbrand wrote:
> On 17.06.22 09:28, Muchun Song wrote:
> > On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
> >> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
> >>> IIRC, that was used to skip these patches on the offlining path before
> >>> we provided the ranges to offline_pages().
> >>
> >> Yeah, it was designed for that purpose back then.
> >>
> >>> I'd not mess with PG_reserved, and give them a clearer name, to not
> >>> confuse them with other, ordinary, vmemmap pages that are not
> >>> self-hosted (maybe in the future we might want to flag all vmemmap pages
> >>> with a new type?).
> >>
> >> Not sure whether a new type is really needed, or to put it another way, I
> >> cannot see the benefit.
> >>
> >>>
> >>> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
> >>> (v)memmap pages with a type PG_memmap. However, the latter would be
> >>> optional and might not be strictly required
> >>>
> >>>
> >>> So what think could make sense is
> >>>
> >>> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
> >>> PG_vmemmap_self_hosted = PG_owner_priv_1,
> >>
> >> Sure, I just lightly tested the below, and seems to work, but not sure
> >> whether that is what you are referring to.
> >> @Munchun: thoughts?
> >>
> >
> > I think it works and fits my requirement.
> >
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index e66f7aa3191d..a4556afd7bda 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -193,6 +193,11 @@ enum pageflags {
> >>
> >> /* Only valid for buddy pages. Used to track pages that are reported */
> >> PG_reported = PG_uptodate,
> >> +
> >> +#ifdef CONFIG_MEMORY_HOTPLUG
> >> + /* For self-hosted memmap pages */
> >> + PG_vmemmap_self_hosted = PG_owner_priv_1,
> >> +#endif
> >> };
> >>
> >> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> >> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
> >> */
> >> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
> >>
> >> +#ifdef CONFIG_MEMORY_HOTPLUG
> >> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
> >> +#endif
> >> +
> >> /*
> >> * On an anonymous page mapped into a user virtual memory area,
> >> * page->mapping points to its anon_vma, not to a struct address_space;
> >> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> >> index 1089ea8a9c98..e2de7ed27e9e 100644
> >> --- a/mm/hugetlb_vmemmap.c
> >> +++ b/mm/hugetlb_vmemmap.c
> >> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> >> {
> >> unsigned long vmemmap_addr = (unsigned long)head;
> >> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> >> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> >> + struct page *memmap;
> >> +
> >> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> >> + pfn_to_section_nr(page_to_pfn(head)));
> >> +
> >> + if (PageVmemmap_self_hosted(memmap))
> >> + return;
> >
> > I think here needs a loop if it is a 1GB page (spans multiple sections).
> > Right? Here is an implementation based on another approach. But I think
> > your implementation is more simpler and efficient. Would you mind me
> > squash your diff into my patch and with your "Co-developed-by"?
>
> Due to hugtlb alignment requirements, and the vmemmap pages being at the
> start of the hotplugged memory region, I think that cannot currently
> happen. Checking the first vmemmap page might be good enough for now,
> and probably for the future.
>
If the memory block size is 128MB, then a 1GB huge page spans 8 blocks.
Is it possible that some blocks of them are vmemmap-hosted?
Thanks.
On 17.06.22 11:10, Muchun Song wrote:
> On Fri, Jun 17, 2022 at 09:39:27AM +0200, David Hildenbrand wrote:
>> On 17.06.22 09:28, Muchun Song wrote:
>>> On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
>>>> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
>>>>> IIRC, that was used to skip these patches on the offlining path before
>>>>> we provided the ranges to offline_pages().
>>>>
>>>> Yeah, it was designed for that purpose back then.
>>>>
>>>>> I'd not mess with PG_reserved, and give them a clearer name, to not
>>>>> confuse them with other, ordinary, vmemmap pages that are not
>>>>> self-hosted (maybe in the future we might want to flag all vmemmap pages
>>>>> with a new type?).
>>>>
>>>> Not sure whether a new type is really needed, or to put it another way, I
>>>> cannot see the benefit.
>>>>
>>>>>
>>>>> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
>>>>> (v)memmap pages with a type PG_memmap. However, the latter would be
>>>>> optional and might not be strictly required
>>>>>
>>>>>
>>>>> So what think could make sense is
>>>>>
>>>>> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
>>>>> PG_vmemmap_self_hosted = PG_owner_priv_1,
>>>>
>>>> Sure, I just lightly tested the below, and seems to work, but not sure
>>>> whether that is what you are referring to.
>>>> @Munchun: thoughts?
>>>>
>>>
>>> I think it works and fits my requirement.
>>>
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index e66f7aa3191d..a4556afd7bda 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -193,6 +193,11 @@ enum pageflags {
>>>>
>>>> /* Only valid for buddy pages. Used to track pages that are reported */
>>>> PG_reported = PG_uptodate,
>>>> +
>>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>>> + /* For self-hosted memmap pages */
>>>> + PG_vmemmap_self_hosted = PG_owner_priv_1,
>>>> +#endif
>>>> };
>>>>
>>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>>>> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
>>>> */
>>>> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>>>>
>>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>>> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
>>>> +#endif
>>>> +
>>>> /*
>>>> * On an anonymous page mapped into a user virtual memory area,
>>>> * page->mapping points to its anon_vma, not to a struct address_space;
>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>> index 1089ea8a9c98..e2de7ed27e9e 100644
>>>> --- a/mm/hugetlb_vmemmap.c
>>>> +++ b/mm/hugetlb_vmemmap.c
>>>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
>>>> {
>>>> unsigned long vmemmap_addr = (unsigned long)head;
>>>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
>>>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
>>>> + struct page *memmap;
>>>> +
>>>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
>>>> + pfn_to_section_nr(page_to_pfn(head)));
>>>> +
>>>> + if (PageVmemmap_self_hosted(memmap))
>>>> + return;
>>>
>>> I think here needs a loop if it is a 1GB page (spans multiple sections).
>>> Right? Here is an implementation based on another approach. But I think
>>> your implementation is more simpler and efficient. Would you mind me
>>> squash your diff into my patch and with your "Co-developed-by"?
>>
>> Due to hugtlb alignment requirements, and the vmemmap pages being at the
>> start of the hotplugged memory region, I think that cannot currently
>> happen. Checking the first vmemmap page might be good enough for now,
>> and probably for the future.
>>
>
> If the memory block size is 128MB, then a 1GB huge page spans 8 blocks.
> Is it possible that some blocks of them are vmemmap-hosted?
No, don't think so. If you think about it, a huge/gigantic page can only
start in a memmap-on-memory region but never end in on (or overlap one)
-- because the reserved memmap part of the memory block always precedes
actually usable data.
So even with variable-size memory blocks and weird address alignment,
checking the first memmap of a huge page for vmemmp-on-memory should be
sufficient.
Unless I am missing something.
--
Thanks,
David / dhildenb
On Fri, Jun 17, 2022 at 11:25:20AM +0200, David Hildenbrand wrote:
> On 17.06.22 11:10, Muchun Song wrote:
> > On Fri, Jun 17, 2022 at 09:39:27AM +0200, David Hildenbrand wrote:
> >> On 17.06.22 09:28, Muchun Song wrote:
> >>> On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
> >>>> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
> >>>>> IIRC, that was used to skip these patches on the offlining path before
> >>>>> we provided the ranges to offline_pages().
> >>>>
> >>>> Yeah, it was designed for that purpose back then.
> >>>>
> >>>>> I'd not mess with PG_reserved, and give them a clearer name, to not
> >>>>> confuse them with other, ordinary, vmemmap pages that are not
> >>>>> self-hosted (maybe in the future we might want to flag all vmemmap pages
> >>>>> with a new type?).
> >>>>
> >>>> Not sure whether a new type is really needed, or to put it another way, I
> >>>> cannot see the benefit.
> >>>>
> >>>>>
> >>>>> I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
> >>>>> (v)memmap pages with a type PG_memmap. However, the latter would be
> >>>>> optional and might not be strictly required
> >>>>>
> >>>>>
> >>>>> So what think could make sense is
> >>>>>
> >>>>> /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
> >>>>> PG_vmemmap_self_hosted = PG_owner_priv_1,
> >>>>
> >>>> Sure, I just lightly tested the below, and seems to work, but not sure
> >>>> whether that is what you are referring to.
> >>>> @Munchun: thoughts?
> >>>>
> >>>
> >>> I think it works and fits my requirement.
> >>>
> >>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >>>> index e66f7aa3191d..a4556afd7bda 100644
> >>>> --- a/include/linux/page-flags.h
> >>>> +++ b/include/linux/page-flags.h
> >>>> @@ -193,6 +193,11 @@ enum pageflags {
> >>>>
> >>>> /* Only valid for buddy pages. Used to track pages that are reported */
> >>>> PG_reported = PG_uptodate,
> >>>> +
> >>>> +#ifdef CONFIG_MEMORY_HOTPLUG
> >>>> + /* For self-hosted memmap pages */
> >>>> + PG_vmemmap_self_hosted = PG_owner_priv_1,
> >>>> +#endif
> >>>> };
> >>>>
> >>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> >>>> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
> >>>> */
> >>>> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
> >>>>
> >>>> +#ifdef CONFIG_MEMORY_HOTPLUG
> >>>> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
> >>>> +#endif
> >>>> +
> >>>> /*
> >>>> * On an anonymous page mapped into a user virtual memory area,
> >>>> * page->mapping points to its anon_vma, not to a struct address_space;
> >>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> >>>> index 1089ea8a9c98..e2de7ed27e9e 100644
> >>>> --- a/mm/hugetlb_vmemmap.c
> >>>> +++ b/mm/hugetlb_vmemmap.c
> >>>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> >>>> {
> >>>> unsigned long vmemmap_addr = (unsigned long)head;
> >>>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> >>>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> >>>> + struct page *memmap;
> >>>> +
> >>>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> >>>> + pfn_to_section_nr(page_to_pfn(head)));
> >>>> +
> >>>> + if (PageVmemmap_self_hosted(memmap))
> >>>> + return;
> >>>
> >>> I think here needs a loop if it is a 1GB page (spans multiple sections).
> >>> Right? Here is an implementation based on another approach. But I think
> >>> your implementation is more simpler and efficient. Would you mind me
> >>> squash your diff into my patch and with your "Co-developed-by"?
> >>
> >> Due to hugtlb alignment requirements, and the vmemmap pages being at the
> >> start of the hotplugged memory region, I think that cannot currently
> >> happen. Checking the first vmemmap page might be good enough for now,
> >> and probably for the future.
> >>
> >
> > If the memory block size is 128MB, then a 1GB huge page spans 8 blocks.
> > Is it possible that some blocks of them are vmemmap-hosted?
>
> No, don't think so. If you think about it, a huge/gigantic page can only
> start in a memmap-on-memory region but never end in on (or overlap one)
> -- because the reserved memmap part of the memory block always precedes
> actually usable data.
>
> So even with variable-size memory blocks and weird address alignment,
> checking the first memmap of a huge page for vmemmp-on-memory should be
> sufficient.
>
> Unless I am missing something.
>
Got it. You are awesome. I ignored the fact that we have reserved
some memory as vmemmap pages in memmap-on-memory case, the whole
memory block cannot be used as a gigantic page. Thanks for your
nice explanation.
On Fri, Jun 17, 2022 at 03:28:10PM +0800, Muchun Song wrote:
> I think here needs a loop if it is a 1GB page (spans multiple sections).
> Right? Here is an implementation based on another approach. But I think
> your implementation is more simpler and efficient. Would you mind me
> squash your diff into my patch and with your "Co-developed-by"?
Sure, fine by me.
--
Oscar Salvador
SUSE Labs
On 17.06.22 11:54, Oscar Salvador wrote:
> On Fri, Jun 17, 2022 at 09:43:33AM +0200, David Hildenbrand wrote:
>> VmemmapSelfHosted, then the function names get nicer.
>
> Definitely.
>
>>
>>> +#endif
>>> +
>>> /*
>>> * On an anonymous page mapped into a user virtual memory area,
>>> * page->mapping points to its anon_vma, not to a struct address_space;
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index 1089ea8a9c98..e2de7ed27e9e 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
>>> {
>>> unsigned long vmemmap_addr = (unsigned long)head;
>>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
>>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
>>> + struct page *memmap;
>>> +
>>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
>>> + pfn_to_section_nr(page_to_pfn(head)));
>>
>> Why can't we check the head page directly? Either I need more coffee or
>> this can be simplified.
>
> Uhm, maybe I'm the one who needs coffe here but we have something like:
>
> [ hot-plugges section ]
> [memmap pages][normal pages]
Oh, right, we need the memmap for our hugetlb page (which resides in the
reserved section), but these are not marked. We need the memmap of that
memmap.
I assume one could directly via the page address. Because we want the
memmap of the memmap.
vmmemmap_page = virt_to_page(head);
Not sure, though, if that would work with that function.
--
Thanks,
David / dhildenb
On Fri, Jun 17, 2022 at 09:43:33AM +0200, David Hildenbrand wrote:
> VmemmapSelfHosted, then the function names get nicer.
Definitely.
>
> > +#endif
> > +
> > /*
> > * On an anonymous page mapped into a user virtual memory area,
> > * page->mapping points to its anon_vma, not to a struct address_space;
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index 1089ea8a9c98..e2de7ed27e9e 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> > {
> > unsigned long vmemmap_addr = (unsigned long)head;
> > unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> > + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> > + struct page *memmap;
> > +
> > + memmap = sparse_decode_mem_map(ms->section_mem_map,
> > + pfn_to_section_nr(page_to_pfn(head)));
>
> Why can't we check the head page directly? Either I need more coffee or
> this can be simplified.
Uhm, maybe I'm the one who needs coffe here but we have something like:
[ hot-plugges section ]
[memmap pages][normal pages]
we only mark as VmemmapSelfHosted the memmap pages.
head page points to [normal pages] range, that is why we need to go
and get its mem_map to see whether those pages are marked.
Does it make sense? Or am I missing something?
> > +
> > + if (PageVmemmap_self_hosted(memmap))
>
> Maybe that's the right place for a comment, an ascii art, and how it is
> safe to only check the first vmemmap page due to alignment restrictions.
Yes, definitely worth putting in a comment.
> > + /*
> > + * Let us flag self-hosted memmap
> > + */
>
> I think that comment can be dropped because the code does exactly that.
Yeah, was mainly to picture it, but it needs to go as it is worthless.
--
Oscar Salvador
SUSE Labs
On Fri, Jun 17, 2022 at 12:14:02PM +0200, David Hildenbrand wrote:
> On 17.06.22 11:54, Oscar Salvador wrote:
> > On Fri, Jun 17, 2022 at 09:43:33AM +0200, David Hildenbrand wrote:
> >> VmemmapSelfHosted, then the function names get nicer.
> >
> > Definitely.
> >
> >>
> >>> +#endif
> >>> +
> >>> /*
> >>> * On an anonymous page mapped into a user virtual memory area,
> >>> * page->mapping points to its anon_vma, not to a struct address_space;
> >>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> >>> index 1089ea8a9c98..e2de7ed27e9e 100644
> >>> --- a/mm/hugetlb_vmemmap.c
> >>> +++ b/mm/hugetlb_vmemmap.c
> >>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> >>> {
> >>> unsigned long vmemmap_addr = (unsigned long)head;
> >>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> >>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> >>> + struct page *memmap;
> >>> +
> >>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> >>> + pfn_to_section_nr(page_to_pfn(head)));
> >>
> >> Why can't we check the head page directly? Either I need more coffee or
> >> this can be simplified.
> >
> > Uhm, maybe I'm the one who needs coffe here but we have something like:
> >
> > [ hot-plugges section ]
> > [memmap pages][normal pages]
>
> Oh, right, we need the memmap for our hugetlb page (which resides in the
> reserved section), but these are not marked. We need the memmap of that
> memmap.
>
> I assume one could directly via the page address. Because we want the
> memmap of the memmap.
>
> vmmemmap_page = virt_to_page(head);
>
I think this can works, more simple.
Thanks.
> Not sure, though, if that would work with that function.
On Fri, Jun 17, 2022 at 06:49:15PM +0800, Muchun Song wrote:
> On Fri, Jun 17, 2022 at 12:14:02PM +0200, David Hildenbrand wrote:
> > On 17.06.22 11:54, Oscar Salvador wrote:
> > > On Fri, Jun 17, 2022 at 09:43:33AM +0200, David Hildenbrand wrote:
> > >> VmemmapSelfHosted, then the function names get nicer.
> > >
> > > Definitely.
> > >
> > >>
> > >>> +#endif
> > >>> +
> > >>> /*
> > >>> * On an anonymous page mapped into a user virtual memory area,
> > >>> * page->mapping points to its anon_vma, not to a struct address_space;
> > >>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > >>> index 1089ea8a9c98..e2de7ed27e9e 100644
> > >>> --- a/mm/hugetlb_vmemmap.c
> > >>> +++ b/mm/hugetlb_vmemmap.c
> > >>> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> > >>> {
> > >>> unsigned long vmemmap_addr = (unsigned long)head;
> > >>> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> > >>> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
> > >>> + struct page *memmap;
> > >>> +
> > >>> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> > >>> + pfn_to_section_nr(page_to_pfn(head)));
> > >>
> > >> Why can't we check the head page directly? Either I need more coffee or
> > >> this can be simplified.
> > >
> > > Uhm, maybe I'm the one who needs coffe here but we have something like:
> > >
> > > [ hot-plugges section ]
> > > [memmap pages][normal pages]
> >
> > Oh, right, we need the memmap for our hugetlb page (which resides in the
> > reserved section), but these are not marked. We need the memmap of that
> > memmap.
> >
> > I assume one could directly via the page address. Because we want the
> > memmap of the memmap.
> >
> > vmmemmap_page = virt_to_page(head);
> >
>
> I think this can works, more simple.
>
Oh, I forgot virt_to_page() cannot applicable for vmemmap addresses.
I think Oscar's approach is right.
> Thanks.
>
> > Not sure, though, if that would work with that function.
>
On Fri, Jun 17, 2022 at 07:46:53AM +0200, Oscar Salvador wrote:
> On Thu, Jun 16, 2022 at 09:30:33AM +0200, David Hildenbrand wrote:
> > IIRC, that was used to skip these patches on the offlining path before
> > we provided the ranges to offline_pages().
>
> Yeah, it was designed for that purpose back then.
>
> > I'd not mess with PG_reserved, and give them a clearer name, to not
> > confuse them with other, ordinary, vmemmap pages that are not
> > self-hosted (maybe in the future we might want to flag all vmemmap pages
> > with a new type?).
>
> Not sure whether a new type is really needed, or to put it another way, I
> cannot see the benefit.
>
> >
> > I'd just try reusing the flag PG_owner_priv_1. And eventually, flag all
> > (v)memmap pages with a type PG_memmap. However, the latter would be
> > optional and might not be strictly required
> >
> >
> > So what think could make sense is
> >
> > /* vmemmap pages that are self-hosted and cannot be optimized/freed. */
> > PG_vmemmap_self_hosted = PG_owner_priv_1,
>
> Sure, I just lightly tested the below, and seems to work, but not sure
> whether that is what you are referring to.
> @Munchun: thoughts?
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index e66f7aa3191d..a4556afd7bda 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -193,6 +193,11 @@ enum pageflags {
>
> /* Only valid for buddy pages. Used to track pages that are reported */
> PG_reported = PG_uptodate,
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> + /* For self-hosted memmap pages */
> + PG_vmemmap_self_hosted = PG_owner_priv_1,
> +#endif
> };
>
> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> @@ -628,6 +633,10 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
> */
> __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +PAGEFLAG(Vmemmap_self_hosted, vmemmap_self_hosted, PF_ANY)
> +#endif
> +
> /*
> * On an anonymous page mapped into a user virtual memory area,
> * page->mapping points to its anon_vma, not to a struct address_space;
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 1089ea8a9c98..e2de7ed27e9e 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -101,6 +101,14 @@ void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
> {
> unsigned long vmemmap_addr = (unsigned long)head;
> unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
> + struct mem_section *ms = __pfn_to_section(page_to_pfn(head));
Hi Oscar,
After more thinkging, I think here should be:
struct mem_section *ms = __pfn_to_section(ALIGN_DOWN(page_to_pfn(head), memory_block_size_bytes()));
Why?
[ hotplugged memory ]
[ section ][...][ section ]
[ vmemmap ][ usable memory ]
^ | | |
+---+ | |
^ | |
+--------+ |
^ |
+-------------------------------------------+
The page_to_pfn(head) can falls onto the non-1st section, actually, we desire
1st section which ->section_mem_map is the start vmemmap of the vmemmap.
If we align the page_to_pfn(head) with the start pfn of the hotplugged memory,
then we can simplify the code further.
unsigned long size = memory_block_size_bytes();
unsigned long pfn = ALIGN_DOWN(page_to_pfn(head), size);
if (pfn_valid(pfn) && PageVmemmapSelfHosted(pfn_to_page(pfn)))
return;
Hotplugged memory block never has non-present sections, while boot memory block
can have one or more. So pfn_valid() is used to filter out the first section if
it is non-present.
Hopefully I am not wrong.
Thanks.
> + struct page *memmap;
> +
> + memmap = sparse_decode_mem_map(ms->section_mem_map,
> + pfn_to_section_nr(page_to_pfn(head)));
> +
> + if (PageVmemmap_self_hosted(memmap))
> + return;
>
> vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
> if (!vmemmap_pages)
> @@ -199,10 +207,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
> static __init int hugetlb_vmemmap_sysctls_init(void)
> {
> /*
> - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
> - * crosses page boundaries, the vmemmap pages cannot be optimized.
> + * If "struct page" crosses page boundaries, the vmemmap pages cannot
> + * be optimized.
> */
> - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
> + if (is_power_of_2(sizeof(struct page)))
> register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
>
> return 0;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 1213d0c67a53..863966c2c6f1 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -45,8 +45,6 @@
> #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
> static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
> {
> - if (hugetlb_optimize_vmemmap_enabled())
> - return 0;
> return param_set_bool(val, kp);
> }
>
> @@ -1032,6 +1030,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
> {
> unsigned long end_pfn = pfn + nr_pages;
> int ret;
> + int i;
>
> ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
> if (ret)
> @@ -1039,6 +1038,12 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
>
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
>
> + /*
> + * Let us flag self-hosted memmap
> + */
> + for (i = 0; i < nr_pages; i++)
> + SetPageVmemmap_self_hosted(pfn_to_page(pfn + i));
> +
> /*
> * It might be that the vmemmap_pages fully span sections. If that is
> * the case, mark those sections online here as otherwise they will be
>
>
> --
> Oscar Salvador
> SUSE Labs
>