2022-07-05 06:59:16

by 罗飞

[permalink] [raw]
Subject: [PATCH] mm,hwpoison,hugetlb: defer dissolve hwpoison hugepage when allocating vmemmap failed

When dissolving a hwpoisoned hugepage, if the allocation of vmemmap
pages fails, the faulty page should not be put back on the hugepage
free list, as that would allow the faulty page to be reused. It is
better to postpone the dissolve operation and retry it later.

Meanwhile, by the time the memory failure handling code calls
dissolve_free_huge_page() to dissolve the faulty page, the core
recovery work (such as migrating pages and unmapping) has already
completed. There is no need to return -ENOMEM to the caller for a
temporary vmemmap page allocation failure, which would lead the
caller to a wrong conclusion. So just defer the dissolve and return
success.

Signed-off-by: luofei <[email protected]>
---
mm/hugetlb.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca081078e814..db25458eb0a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -90,6 +90,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static LLIST_HEAD(hpage_freelist);
+static void free_hpage_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(free_hpage_work, free_hpage_workfn);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -1535,15 +1538,21 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;

- if (hugetlb_vmemmap_restore(h, page))
+ if (hugetlb_vmemmap_restore(h, page)) {
+ if (unlikely(PageHWPoison(page))) {
+ llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
+ schedule_delayed_work(&free_hpage_work, HZ);
+ goto out;
+ }
goto fail;
+ }

/*
* Move PageHWPoison flag from head page to the raw error pages,
* which makes any healthy subpages reusable.
*/
if (unlikely(PageHWPoison(page) && hugetlb_clear_page_hwpoison(page)))
- goto fail;
+ goto out;

for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
@@ -1574,6 +1583,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
*/
add_hugetlb_page(h, page, true);
spin_unlock_irq(&hugetlb_lock);
+out:
+ return;
}

/*
@@ -1587,8 +1598,6 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
* structure of a lockless linked list of huge pages to be freed.
*/
-static LLIST_HEAD(hpage_freelist);
-
static void free_hpage_workfn(struct work_struct *work)
{
struct llist_node *node;
@@ -1616,12 +1625,11 @@ static void free_hpage_workfn(struct work_struct *work)
cond_resched();
}
}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

static inline void flush_free_hpage_work(struct hstate *h)
{
if (hugetlb_vmemmap_optimizable(h))
- flush_work(&free_hpage_work);
+ flush_delayed_work(&free_hpage_work);
}

static void update_and_free_page(struct hstate *h, struct page *page,
@@ -1634,13 +1642,9 @@ static void update_and_free_page(struct hstate *h, struct page *page,

/*
* Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
- *
- * Only call schedule_work() if hpage_freelist is previously
- * empty. Otherwise, schedule_work() had been called but the workfn
- * hasn't retrieved the list yet.
*/
- if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
- schedule_work(&free_hpage_work);
+ llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
+ schedule_delayed_work(&free_hpage_work, 0);
}

static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
@@ -2118,11 +2122,15 @@ int dissolve_free_huge_page(struct page *page)
rc = hugetlb_vmemmap_restore(h, head);
if (!rc) {
update_and_free_page(h, head, false);
- } else {
+ } else if (!PageHWPoison(head)) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_page(h, head, false);
h->max_huge_pages++;
spin_unlock_irq(&hugetlb_lock);
+ } else {
+ llist_add((struct llist_node *)&head->mapping, &hpage_freelist);
+ schedule_delayed_work(&free_hpage_work, HZ);
+ rc = 0;
}

return rc;
--
2.27.0


2022-07-05 09:01:06

by Muchun Song

[permalink] [raw]
Subject: Re: [PATCH] mm,hwpoison,hugetlb: defer dissolve hwpoison hugepage when allocating vmemmap failed

On Tue, Jul 5, 2022 at 2:32 PM luofei <[email protected]> wrote:
>
> When dissolving a hwpoisoned hugepage, if the allocation of vmemmap
> pages fails, the faulty page should not be put back on the hugepage
> free list, as that would allow the faulty page to be reused. It is

Hi luofei,

How did it happen? If a hugepage is poisoned, the head page's
PageHWPoison flag is set. See dequeue_huge_page_node_exact(),
which filters out hwpoisoned pages, so a hwpoisoned page cannot
be reused. Hopefully I am not missing something important.
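
For reference, here is a simplified sketch of that filter (trimmed and
slightly condensed, so details may differ from the exact mm/hugetlb.c
source of this era):

static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
	struct page *page;

	lockdep_assert_held(&hugetlb_lock);
	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
		/*
		 * A hwpoisoned hugepage sitting on the free list is
		 * skipped here, so the dequeue path never hands it out.
		 */
		if (PageHWPoison(page))
			continue;

		list_move(&page->lru, &h->hugepage_activelist);
		set_page_refcounted(page);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		return page;
	}

	return NULL;
}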

> better to postpone the dissolve operation and retry it later.
>
> Meanwhile, by the time the memory failure handling code calls
> dissolve_free_huge_page() to dissolve the faulty page, the core
> recovery work (such as migrating pages and unmapping) has already
> completed. There is no need to return -ENOMEM to the caller for a
> temporary vmemmap page allocation failure, which would lead the
> caller to a wrong conclusion. So just defer the dissolve and return
> success.
>
> Signed-off-by: luofei <[email protected]>
> ---
> mm/hugetlb.c | 34 +++++++++++++++++++++-------------
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ca081078e814..db25458eb0a5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -90,6 +90,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>
> /* Forward declaration */
> static int hugetlb_acct_memory(struct hstate *h, long delta);
> +static LLIST_HEAD(hpage_freelist);
> +static void free_hpage_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(free_hpage_work, free_hpage_workfn);
>
> static inline bool subpool_is_free(struct hugepage_subpool *spool)
> {
> @@ -1535,15 +1538,21 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
> return;
>
> - if (hugetlb_vmemmap_restore(h, page))
> + if (hugetlb_vmemmap_restore(h, page)) {
> + if (unlikely(PageHWPoison(page))) {
> + llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, HZ);
> + goto out;
> + }
> goto fail;
> + }
>
> /*
> * Move PageHWPoison flag from head page to the raw error pages,
> * which makes any healthy subpages reusable.
> */
> if (unlikely(PageHWPoison(page) && hugetlb_clear_page_hwpoison(page)))
> - goto fail;
> + goto out;
>
> for (i = 0; i < pages_per_huge_page(h);
> i++, subpage = mem_map_next(subpage, page, i)) {
> @@ -1574,6 +1583,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> */
> add_hugetlb_page(h, page, true);
> spin_unlock_irq(&hugetlb_lock);
> +out:
> + return;
> }
>
> /*
> @@ -1587,8 +1598,6 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
> * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
> * structure of a lockless linked list of huge pages to be freed.
> */
> -static LLIST_HEAD(hpage_freelist);
> -
> static void free_hpage_workfn(struct work_struct *work)
> {
> struct llist_node *node;
> @@ -1616,12 +1625,11 @@ static void free_hpage_workfn(struct work_struct *work)
> cond_resched();
> }
> }
> -static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
>
> static inline void flush_free_hpage_work(struct hstate *h)
> {
> if (hugetlb_vmemmap_optimizable(h))
> - flush_work(&free_hpage_work);
> + flush_delayed_work(&free_hpage_work);
> }
>
> static void update_and_free_page(struct hstate *h, struct page *page,
> @@ -1634,13 +1642,9 @@ static void update_and_free_page(struct hstate *h, struct page *page,
>
> /*
> * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
> - *
> - * Only call schedule_work() if hpage_freelist is previously
> - * empty. Otherwise, schedule_work() had been called but the workfn
> - * hasn't retrieved the list yet.
> */
> - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
> - schedule_work(&free_hpage_work);
> + llist_add((struct llist_node *)&page->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, 0);
> }
>
> static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> @@ -2118,11 +2122,15 @@ int dissolve_free_huge_page(struct page *page)
> rc = hugetlb_vmemmap_restore(h, head);
> if (!rc) {
> update_and_free_page(h, head, false);
> - } else {
> + } else if (!PageHWPoison(head)) {
> spin_lock_irq(&hugetlb_lock);
> add_hugetlb_page(h, head, false);
> h->max_huge_pages++;
> spin_unlock_irq(&hugetlb_lock);
> + } else {
> + llist_add((struct llist_node *)&head->mapping, &hpage_freelist);
> + schedule_delayed_work(&free_hpage_work, HZ);
> + rc = 0;
> }
>
> return rc;
> --
> 2.27.0
>

2022-07-05 09:28:35

by 罗飞

[permalink] [raw]
Subject: Re: [PATCH] mm,hwpoison,hugetlb: defer dissolve hwpoison hugepage when allocating vmemmap failed

> On Tue, Jul 5, 2022 at 2:32 PM luofei <[email protected]> wrote:
>>
>> When dissolving a hwpoisoned hugepage, if the allocation of vmemmap
>> pages fails, the faulty page should not be put back on the hugepage
>> free list, as that would allow the faulty page to be reused. It is
>
> Hi luofei,
>
> How did it happen? If a hugepage is poisoned, the head page's
> PageHWPoison flag is set. See dequeue_huge_page_node_exact(),
> which filters out hwpoisoned pages, so a hwpoisoned page cannot
> be reused. Hopefully I am not missing something important.

Sorry, yes, I did not notice the check for hwpoisoned pages there.
________________________________________
From: Muchun Song <[email protected]>
Sent: July 5, 2022 16:57:35
To: luofei
Cc: Mike Kravetz; Andrew Morton; Linux Memory Management List; LKML
Subject: Re: [PATCH] mm,hwpoison,hugetlb: defer dissolve hwpoison hugepage when allocating vmemmap failed
