Hi,
This patchset tries to solve the issue among memory_hotplug, hugetlb and
hwpoison. Based on the discussion on v1, this version goes in the
direction of changing the behavior of memory hotplug for hwpoison:
- hwpoison pages should not prevent memory hotremove,
- memory block with hwpoison pages should not be onlined.
I tested both with ACPI-based and sysfs-based memory hotplug, and passed
basic testcases.
Any comments and feedback would be appreciated.
Thanks,
Naoya Horiguchi
v1: https://lore.kernel.org/linux-mm/[email protected]/T
---
Summary:
Naoya Horiguchi (4):
mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage
mm/hwpoison: move definitions of num_poisoned_pages_* to memory-failure.c
mm/hwpoison: pass pfn to num_poisoned_pages_*()
mm/hwpoison: introduce per-memory_block hwpoison counter
arch/parisc/kernel/pdt.c | 5 ++---
drivers/base/memory.c | 36 ++++++++++++++++++++++++++++++++++++
include/linux/memory.h | 3 +++
include/linux/mm.h | 12 ++++++++++++
include/linux/swapops.h | 25 -------------------------
mm/internal.h | 8 --------
mm/memory-failure.c | 45 +++++++++++++++++++++++----------------------
mm/memory_hotplug.c | 22 +++++++++++-----------
mm/sparse.c | 2 --
9 files changed, 87 insertions(+), 71 deletions(-)
From: Naoya Horiguchi <[email protected]>
HWPoisoned page is not supposed to be accessed once marked, but currently
such accesses can happen during memory hotremove because do_migrate_range()
can be called before dissolve_free_huge_pages() is called.
Move dissolve_free_huge_pages() before scan_movable_pages(). Recently
delayed dissolve has been implemented, so the dissolving can turn
a hwpoisoned hugepage into 4kB hwpoison page, which memory hotplug can
handle safely.
Reported-by: Miaohe Lin <[email protected]>
Signed-off-by: Naoya Horiguchi <[email protected]>
---
mm/memory_hotplug.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fad6d1f2262a..c24735d63b25 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1880,6 +1880,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
cond_resched();
+ /*
+ * Dissolve free hugepages in the memory block before doing
+ * offlining actually in order to make hugetlbfs's object
+ * counting consistent.
+ */
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ if (ret) {
+ reason = "failure to dissolve huge pages";
+ goto failed_removal_isolated;
+ }
+
ret = scan_movable_pages(pfn, end_pfn, &pfn);
if (!ret) {
/*
@@ -1895,17 +1906,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_isolated;
}
- /*
- * Dissolve free hugepages in the memory block before doing
- * offlining actually in order to make hugetlbfs's object
- * counting consistent.
- */
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
- if (ret) {
- reason = "failure to dissolve huge pages";
- goto failed_removal_isolated;
- }
-
ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
} while (ret);
--
2.25.1
From: Naoya Horiguchi <[email protected]>
These interfaces will be used by drivers/base/memory.c in a later patch, so as
preparatory work, move them to a more common header file that is visible to it.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
arch/parisc/kernel/pdt.c | 3 +--
include/linux/mm.h | 4 ++++
include/linux/swapops.h | 25 -------------------------
mm/memory-failure.c | 15 +++++++++++++++
4 files changed, 20 insertions(+), 27 deletions(-)
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index e391b175f5ec..fdc880e2575a 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -18,8 +18,7 @@
#include <linux/kthread.h>
#include <linux/initrd.h>
#include <linux/pgtable.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/mm.h>
#include <asm/pdc.h>
#include <asm/pdcpat.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..b81dd600e51a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3202,6 +3202,10 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
{
return 0;
}
+
+static inline void num_poisoned_pages_inc()
+{
+}
#endif
#ifndef arch_memory_failure
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index ddc98f96ad2c..55afc2aaba6b 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -459,8 +459,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
#ifdef CONFIG_MEMORY_FAILURE
-extern atomic_long_t num_poisoned_pages __read_mostly;
-
/*
* Support for hardware poisoned pages
*/
@@ -488,21 +486,6 @@ static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
return p;
}
-static inline void num_poisoned_pages_inc(void)
-{
- atomic_long_inc(&num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_dec(void)
-{
- atomic_long_dec(&num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
- atomic_long_sub(i, &num_poisoned_pages);
-}
-
#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -519,14 +502,6 @@ static inline struct page *hwpoison_entry_to_page(swp_entry_t entry)
{
return NULL;
}
-
-static inline void num_poisoned_pages_inc(void)
-{
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
-}
#endif
static inline int non_swap_entry(swp_entry_t entry)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7b077da568ff..b6236c721f54 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,6 +74,21 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
+static inline void num_poisoned_pages_inc(void)
+{
+ atomic_long_inc(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_dec(void)
+{
+ atomic_long_dec(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_sub(long i)
+{
+ atomic_long_sub(i, &num_poisoned_pages);
+}
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
--
2.25.1
From: Naoya Horiguchi <[email protected]>
No functional change.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
arch/parisc/kernel/pdt.c | 2 +-
include/linux/mm.h | 2 +-
mm/memory-failure.c | 16 ++++++++--------
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index fdc880e2575a..80943a00e245 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -231,7 +231,7 @@ void __init pdc_pdt_init(void)
/* mark memory page bad */
memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(addr >> PAGE_SHIFT);
}
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b81dd600e51a..6316973afd1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3203,7 +3203,7 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
return 0;
}
-static inline void num_poisoned_pages_inc()
+static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}
#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b6236c721f54..7dd4e403e634 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,17 +74,17 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
-static inline void num_poisoned_pages_inc(void)
+static inline void num_poisoned_pages_inc(unsigned long pfn)
{
atomic_long_inc(&num_poisoned_pages);
}
-static inline void num_poisoned_pages_dec(void)
+static inline void num_poisoned_pages_dec(unsigned long pfn)
{
atomic_long_dec(&num_poisoned_pages);
}
-static inline void num_poisoned_pages_sub(long i)
+static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
atomic_long_sub(i, &num_poisoned_pages);
}
@@ -130,7 +130,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (release)
put_page(page);
page_ref_inc(page);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
return true;
}
@@ -1196,7 +1196,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(pfn);
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -1743,7 +1743,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
llist_add(&raw_hwp->node, head);
/* the first error event will be counted in action_result(). */
if (ret)
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -2408,7 +2408,7 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
if (!ret || freeit) {
- num_poisoned_pages_sub(count);
+ num_poisoned_pages_sub(pfn, count);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
}
@@ -2625,7 +2625,7 @@ void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec();
+ num_poisoned_pages_dec(page_to_pfn(&memmap[i]));
ClearPageHWPoison(&memmap[i]);
}
}
--
2.25.1
From: Naoya Horiguchi <[email protected]>
Currently PageHWPoison flag does not behave well when experiencing memory
hotremove/hotplug. Any data field in struct page is unreliable when the
associated memory is offlined, and the current mechanism can't tell whether
a memory section is onlined because a new memory device is installed or
because previous failed offline operations are undone. Especially if
there's a hwpoisoned memory, it's unclear what the best option is.
So introduce a new mechanism to make struct memory_block remember that
a memory block has hwpoisoned memory inside it. And make any online event
fail if the onlined memory block contains hwpoison. struct memory_block
is freed and reallocated over ACPI-based hotremove/hotplug, but not over
sysfs-based hotremove/hotplug. So it's desirable to implement the hwpoison
counter on this struct.
Note that clear_hwpoisoned_pages() is relocated to be called earlier than
now, just before unregistering struct memory_block. Otherwise, the
per-memory_block hwpoison counter is freed and we fail to adjust global
hwpoison counter properly.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
drivers/base/memory.c | 36 ++++++++++++++++++++++++++++++++++++
include/linux/memory.h | 3 +++
include/linux/mm.h | 8 ++++++++
mm/internal.h | 8 --------
mm/memory-failure.c | 36 +++++++++++-------------------------
mm/sparse.c | 2 --
6 files changed, 58 insertions(+), 35 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bc60c9cd3230..10e45083af52 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -183,6 +183,9 @@ static int memory_block_online(struct memory_block *mem)
struct zone *zone;
int ret;
+ if (atomic_long_read(&mem->nr_hwpoison))
+ return -EHWPOISON;
+
zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
start_pfn, nr_pages);
@@ -864,6 +867,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
+ clear_hwpoisoned_pages(atomic_long_read(&mem->nr_hwpoison));
unregister_memory_block_under_nodes(mem);
remove_memory_block(mem);
}
@@ -1170,3 +1174,35 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
}
return ret;
}
+
+#ifdef CONFIG_MEMORY_FAILURE
+
+void memblk_nr_poison_inc(unsigned long pfn)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_inc(&mem->nr_hwpoison);
+}
+
+void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_sub(i, &mem->nr_hwpoison);
+}
+
+unsigned long memblk_nr_poison(unsigned long pfn)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ return atomic_long_read(&mem->nr_hwpoison);
+ return 0;
+}
+
+#endif
diff --git a/include/linux/memory.h b/include/linux/memory.h
index aa619464a1df..74e6b3ad947f 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -85,6 +85,9 @@ struct memory_block {
unsigned long nr_vmemmap_pages;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
+#ifdef CONFIG_MEMORY_FAILURE
+ atomic_long_t nr_hwpoison;
+#endif
};
int arch_get_memory_phys_device(unsigned long start_pfn);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6316973afd1d..951c3cdd7683 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3197,6 +3197,10 @@ extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+extern void memblk_nr_poison_inc(unsigned long pfn);
+extern void memblk_nr_poison_sub(unsigned long pfn, long i);
+extern unsigned long memblk_nr_poison(unsigned long pfn);
+extern void clear_hwpoisoned_pages(long nr_poison);
#else
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
{
@@ -3206,6 +3210,10 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}
+
+static inline void clear_hwpoisoned_pages(long nr_poison)
+{
+}
#endif
#ifndef arch_memory_failure
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..fa481fc04fb7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -738,14 +738,6 @@ extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
-#ifdef CONFIG_MEMORY_FAILURE
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages);
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7dd4e403e634..5f3a0351a200 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,19 +74,17 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
-static inline void num_poisoned_pages_inc(unsigned long pfn)
+void num_poisoned_pages_inc(unsigned long pfn)
{
atomic_long_inc(&num_poisoned_pages);
+ memblk_nr_poison_inc(pfn);
}
-static inline void num_poisoned_pages_dec(unsigned long pfn)
-{
- atomic_long_dec(&num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+void num_poisoned_pages_sub(unsigned long pfn, long i)
{
atomic_long_sub(i, &num_poisoned_pages);
+ if (pfn != -1UL)
+ memblk_nr_poison_sub(pfn, i);
}
/*
@@ -2408,6 +2406,10 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
if (!ret || freeit) {
+ /*
+ * TODO: per-memory_block counter might break when the page
+ * size to be unpoisoned is larger than a memory_block.
+ */
num_poisoned_pages_sub(pfn, count);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
@@ -2610,23 +2612,7 @@ int soft_offline_page(unsigned long pfn, int flags)
return ret;
}
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+void clear_hwpoisoned_pages(long nr_poison)
{
- int i;
-
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
- return;
-
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec(page_to_pfn(&memmap[i]));
- ClearPageHWPoison(&memmap[i]);
- }
- }
+ num_poisoned_pages_sub(-1UL, nr_poison);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index e5a8a3a0edd7..2779b419ef2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
- nr_pages - map_offset);
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
--
2.25.1
On Mon, Sep 05, 2022 at 03:21:35PM +0900, Naoya Horiguchi wrote:
> From: Naoya Horiguchi <[email protected]>
>
> These interfaces will be used by drivers/base/core.c by later patch, so as a
> preparatory work move them to more common header file visible to the file.
>
> Signed-off-by: Naoya Horiguchi <[email protected]>
> ---
...
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 21f8b27bd9fd..b81dd600e51a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3202,6 +3202,10 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
> {
> return 0;
> }
> +
> +static inline void num_poisoned_pages_inc()
Sorry, checkpatch.pl fails on this declaration, I should've given "void"
as the argument.
+static inline void num_poisoned_pages_inc(void)
- Naoya Horiguchi
> +{
> +}
> #endif
On Tue, Sep 06, 2022 at 10:59:58AM +0800, Miaohe Lin wrote:
> On 2022/9/5 14:21, Naoya Horiguchi wrote:
> > From: Naoya Horiguchi <[email protected]>
> >
> > HWPoisoned page is not supposed to be accessed once marked, but currently
> > such accesses can happen during memory hotremove because do_migrate_range()
> > can be called before dissolve_free_huge_pages() is called.
> >
> > Move dissolve_free_huge_pages() before scan_movable_pages(). Recently
> > delayed dissolve has been implemented, so the dissolving can turn
> > a hwpoisoned hugepage into 4kB hwpoison page, which memory hotplug can
> > handle safely.
>
> Yes, thanks for your work, Naoya. ;)
>
> >
> > Reported-by: Miaohe Lin <[email protected]>
> > Signed-off-by: Naoya Horiguchi <[email protected]>
> > ---
> > mm/memory_hotplug.c | 22 +++++++++++-----------
> > 1 file changed, 11 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index fad6d1f2262a..c24735d63b25 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -1880,6 +1880,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> >
> > cond_resched();
> >
> > + /*
> > + * Dissolve free hugepages in the memory block before doing
> > + * offlining actually in order to make hugetlbfs's object
> > + * counting consistent.
> > + */
> > + ret = dissolve_free_huge_pages(start_pfn, end_pfn);
> > + if (ret) {
> > + reason = "failure to dissolve huge pages";
> > + goto failed_removal_isolated;
> > + }
>
> This change has a side-effect. If hugetlb pages are in-use, dissolve_free_huge_pages() will always return -EBUSY
> even if those pages can be migrated. So we fail to hotremove the memory even if they could be offlined.
> Or am I miss something?
Thank you for the comment, you're right. (Taking a look over my test result
carefully, it showed failures for the related cases, I somehow overlooked
them, really sorry.) So my second thought is that we keep offline_pages()
as is, and insert a few line in do_migrate_range() to handle the case of
hwpoisoned hugepage like below:
@@ -1642,6 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
+ if (PageHWPoison(head))
+ continue;
isolate_hugetlb(head, &source);
continue;
} else if (PageTransHuge(page))
This is slightly different from your original suggestion
https://lore.kernel.org/linux-mm/[email protected]/T
, as discussed in the thread existing "if (PageHWPoison(page))" branch in
this function can't be used for hugetlb. We could adjust them to handle
hugetlb, but maybe separating code for hugetlb first from the others looks
less complicated to me.
If you have any suggestion on this, please let me know.
Thanks,
Naoya Horiguchi
On Tue, Sep 06, 2022 at 04:14:40PM +0800, Miaohe Lin wrote:
> On 2022/9/6 14:14, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Tue, Sep 06, 2022 at 10:59:58AM +0800, Miaohe Lin wrote:
> >> On 2022/9/5 14:21, Naoya Horiguchi wrote:
> >>> From: Naoya Horiguchi <[email protected]>
> >>>
> >>> HWPoisoned page is not supposed to be accessed once marked, but currently
> >>> such accesses can happen during memory hotremove because do_migrate_range()
> >>> can be called before dissolve_free_huge_pages() is called.
> >>>
> >>> Move dissolve_free_huge_pages() before scan_movable_pages(). Recently
> >>> delayed dissolve has been implemented, so the dissolving can turn
> >>> a hwpoisoned hugepage into 4kB hwpoison page, which memory hotplug can
> >>> handle safely.
> >>
> >> Yes, thanks for your work, Naoya. ;)
> >>
> >>>
> >>> Reported-by: Miaohe Lin <[email protected]>
> >>> Signed-off-by: Naoya Horiguchi <[email protected]>
> >>> ---
> >>> mm/memory_hotplug.c | 22 +++++++++++-----------
> >>> 1 file changed, 11 insertions(+), 11 deletions(-)
> >>>
> >>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> >>> index fad6d1f2262a..c24735d63b25 100644
> >>> --- a/mm/memory_hotplug.c
> >>> +++ b/mm/memory_hotplug.c
> >>> @@ -1880,6 +1880,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> >>>
> >>> cond_resched();
> >>>
> >>> + /*
> >>> + * Dissolve free hugepages in the memory block before doing
> >>> + * offlining actually in order to make hugetlbfs's object
> >>> + * counting consistent.
> >>> + */
> >>> + ret = dissolve_free_huge_pages(start_pfn, end_pfn);
> >>> + if (ret) {
> >>> + reason = "failure to dissolve huge pages";
> >>> + goto failed_removal_isolated;
> >>> + }
> >>
> >> This change has a side-effect. If hugetlb pages are in-use, dissolve_free_huge_pages() will always return -EBUSY
> >> even if those pages can be migrated. So we fail to hotremove the memory even if they could be offlined.
> >> Or am I miss something?
> >
> > Thank you for the comment, you're right. (Taking a look over my test result
> > carefully, it showed failures for the related cases, I somehow overlooked
> > them, really sorry.) So my second thought is that we keep offline_pages()
> > as is, and insert a few line in do_migrate_range() to handle the case of
> > hwpoisoned hugepage like below:
> >
> > @@ -1642,6 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
> >
> > if (PageHuge(page)) {
> > pfn = page_to_pfn(head) + compound_nr(head) - 1;
> > + if (PageHWPoison(head))
> > + continue;
>
> Thanks for your update. But it seems this is not enough. With the above code change, HWPoisoned
> hugetlb pages will always be ignored in do_migrate_range(). And if these pages are HPageMigratable,
> they will be returned in scan_movable_pages() then passed into the do_migrate_range() again. Thus
> a possible deadloop will occur until these pages become un-movable?
Yeah, so scan_movable_pages() can have an additional check for hwpoisoned hugepages, or
making hwpoisoned hugepages !HPageMigratable (somehow) might be another option.
I like the latter one for now, but I need to look into how I can update the patch further.
>
> > isolate_hugetlb(head, &source);
> > continue;
> > } else if (PageTransHuge(page))
> >
> > This is slightly different from your original suggestion
> > https://lore.kernel.org/linux-mm/[email protected]/T
> > , as discussed in the thread existing "if (PageHWPoison(page))" branch in
> > this function can't be used for hugetlb. We could adjust them to handle
> > hugetlb, but maybe separating code for hugetlb first from the others looks
> > less compicated to me.
>
> It might be better to do something, e.g. unmap the hugetlb pages to prevent accessing from process if mapped,
> even try truncating the error page from pagecache. But I have no strong opinion as handling memory failure
> would always be a best try. ;)
This could be helpful, I'll try some.
Thank you for valuable comments.
- Naoya Horiguchi
>
> Thanks,
> Miaohe Lin
>
>
> >
> > If you have any suggestion on this, please let me know.
> >
> > Thanks,
> > Naoya Horiguchi
> >