2022-07-29 03:25:38

by Kefeng Wang

[permalink] [raw]
Subject: [PATCH v3] mm: memory-failure: convert to pr_fmt()

Use pr_fmt to prefix all pr_<level> output. However, unpoison_memory()
and soft_offline_page() are used by error injection and have their
own prefixes like "Unpoison:" and "soft offline:"; in addition,
soft_offline_page() can be used by memory hotremove. So reset
pr_fmt before the unpoison_pr_info definition to keep the original
output for them.

Acked-by: Naoya Horiguchi <[email protected]>
Reviewed-by: Miaohe Lin <[email protected]>
Signed-off-by: Kefeng Wang <[email protected]>
---
v3
- fix build error by resetting pr_fmt
v2
- add undef pr_fmt and update changelog

mm/memory-failure.c | 58 ++++++++++++++++++++++-----------------------
1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2bc1a47c3d46..14439806b5ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -33,6 +33,9 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -258,7 +261,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret = 0;

- pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);

if ((flags & MF_ACTION_REQUIRED) && (t == current))
@@ -276,7 +279,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
if (ret < 0)
- pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+ pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
@@ -358,7 +361,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,

tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
+ pr_err("Out of memory while machine check handling\n");
return;
}

@@ -385,7 +388,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* has a mapping for the page.
*/
if (tk->addr == -EFAULT) {
- pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+ pr_info("Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
} else if (tk->size_shift == 0) {
kfree(tk);
@@ -418,7 +421,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr == -EFAULT) {
- pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
tk->tsk, PIDTYPE_PID);
@@ -431,7 +434,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* process anyways.
*/
else if (kill_proc(tk, pfn, flags) < 0)
- pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
@@ -821,12 +824,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
int err = mapping->a_ops->error_remove_page(mapping, p);

if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
+ pr_info("%#lx: failed to release buffers\n", pfn);
} else {
ret = MF_RECOVERED;
}
@@ -838,8 +839,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("%#lx: Failed to invalidate\n", pfn);
}

return ret;
@@ -869,7 +869,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
count -= 1;

if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ pr_err("%#lx: %s still referenced by %d users\n",
page_to_pfn(p), action_page_types[ps->type], count);
return true;
}
@@ -893,7 +893,7 @@ static int me_kernel(struct page_state *ps, struct page *p)
*/
static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
+ pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -1179,7 +1179,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
trace_memory_failure_event(pfn, type, result);

num_poisoned_pages_inc();
- pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+ pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}

@@ -1254,8 +1254,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
if (head == compound_head(page))
return 1;

- pr_info("Memory failure: %#lx cannot catch tail\n",
- page_to_pfn(page));
+ pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
put_page(head);
}

@@ -1318,7 +1317,7 @@ static int get_any_page(struct page *p, unsigned long flags)
}
out:
if (ret == -EIO)
- pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
+ pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));

return ret;
}
@@ -1417,13 +1416,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
return true;

if (PageKsm(p)) {
- pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+ pr_err("%#lx: can't handle KSM pages.\n", pfn);
return false;
}

if (PageSwapCache(p)) {
- pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
- pfn);
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}

@@ -1441,7 +1439,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -1470,14 +1468,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
} else
- pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
} else {
try_to_unmap(folio, ttu);
}

unmap_success = !page_mapped(hpage);
if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));

/*
@@ -1844,7 +1842,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
head = compound_head(p);
res = kill_accessing_process(current, page_to_pfn(head), flags);
@@ -2003,8 +2001,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;
}
}
- pr_err("Memory failure: %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("%#lx: memory outside kernel control\n", pfn);
res = -ENXIO;
goto unlock_mutex;
}
@@ -2015,8 +2012,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;

if (TestSetPageHWPoison(p)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2232,7 +2228,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
if (kfifo_put(&mf_cpu->fifo, entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
- pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pr_err("buffer overflow when queuing memory failure at %#lx\n",
pfn);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
@@ -2289,6 +2285,8 @@ static int __init memory_failure_init(void)
}
core_initcall(memory_failure_init);

+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
#define unpoison_pr_info(fmt, pfn, rs) \
({ \
if (__ratelimit(rs)) \
--
2.35.3


2022-07-30 09:42:36

by Miaohe Lin

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memory-failure: convert to pr_fmt()

On 2022/7/29 11:19, Kefeng Wang wrote:
> Use pr_fmt to prefix all pr_<level> output, but unpoison_memory()
> and soft_offline_page() are used by error injection, which have
> own prefixes like "Unpoison:" and "soft offline:", meanwhile,
> soft_offline_page() could be used by memory hotremove, so reset
> pr_fmt before unpoison_pr_info definition to keep the original
> output for them.
>
> Acked-by: Naoya Horiguchi <[email protected]>
> Reviewed-by: Miaohe Lin <[email protected]>
> Signed-off-by: Kefeng Wang <[email protected]>

I'm sorry, but won't this patch make the pr_info in try_to_split_thp_page look like below?

Memory failure: Memory failure: 0x1b8200: thp split failed
or
Memory failure: soft offline: 0x1b8200: thp split failed

Thanks.

2022-08-02 03:20:47

by Kefeng Wang

[permalink] [raw]
Subject: [PATCH] mm: memory-failure: cleanup try_to_split_thp_page()

Since commit 5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP"),
action_result() is called to report the memory error event in memory_failure(),
so the pr_info() in try_to_split_thp_page() is only needed in
soft_offline_in_use_page().

Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory-failure.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f0e1961d4482..59633a617a0a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1524,20 +1524,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}

-static int try_to_split_thp_page(struct page *page, const char *msg)
+static int try_to_split_thp_page(struct page *page)
{
+ int ret;
+
lock_page(page);
- if (unlikely(split_huge_page(page))) {
- unsigned long pfn = page_to_pfn(page);
+ ret = split_huge_page(page);
+ unlock_page(page);

- unlock_page(page);
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ if (unlikely(ret))
put_page(page);
- return -EBUSY;
- }
- unlock_page(page);

- return 0;
+ return ret;
}

static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
@@ -2079,7 +2077,7 @@ int memory_failure(unsigned long pfn, int flags)
* page is a valid handlable page.
*/
SetPageHasHWPoisoned(hpage);
- if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ if (try_to_split_thp_page(p) < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
res = -EBUSY;
goto unlock_mutex;
@@ -2503,8 +2501,11 @@ static int soft_offline_in_use_page(struct page *page)
struct page *hpage = compound_head(page);

if (!PageHuge(page) && PageTransHuge(hpage))
- if (try_to_split_thp_page(page, "soft offline") < 0)
+ if (try_to_split_thp_page(page) < 0) {
+ pr_info("soft offline: %#lx: thp split failed\n",
+ page_to_pfn(page));
return -EBUSY;
+ }
return __soft_offline_page(page);
}

--
2.35.3


2022-08-04 01:15:44

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH] mm: memory-failure: cleanup try_to_split_thp_page()


On 2022/8/4 8:45, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Aug 02, 2022 at 10:12:56AM +0800, Kefeng Wang wrote:
>> Since commit 5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP"),
>> the action_result() called to show memory error event in memory_failure(),
>> so the pr_info() in try_to_split_thp_page() is only needed in
>> soft_offline_in_use_page().
>>
>> Signed-off-by: Kefeng Wang <[email protected]>
> Thanks, looks good to me. Probably this patch may come before
> "mm: memory-failure: convert to pr_fmt()" to be a cleanup.
>
> And recently another patch [1] is trying to change the same function, so we
> might need to resolve the conflict with it. I expect it's not so hard, but
> I think your series had better come after [1] because [1] is a bug fix and
> might be backported to stable-5.19.
OK, I could repost after [1] is merged into the mm tree, thanks.
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/T/#u
>
> Thanks,
> Naoya Horiguchi

Subject: Re: [PATCH] mm: memory-failure: cleanup try_to_split_thp_page()

On Tue, Aug 02, 2022 at 10:12:56AM +0800, Kefeng Wang wrote:
> Since commit 5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP"),
> the action_result() called to show memory error event in memory_failure(),
> so the pr_info() in try_to_split_thp_page() is only needed in
> soft_offline_in_use_page().
>
> Signed-off-by: Kefeng Wang <[email protected]>

Thanks, looks good to me. Probably this patch may come before
"mm: memory-failure: convert to pr_fmt()" to be a cleanup.

And recently another patch [1] is trying to change the same function, so we
might need to resolve the conflict with it. I expect it's not so hard, but
I think your series had better come after [1] because [1] is a bug fix and
might be backported to stable-5.19.

[1] https://lore.kernel.org/linux-mm/[email protected]/T/#u

Thanks,
Naoya Horiguchi

> ---
> mm/memory-failure.c | 23 ++++++++++++-----------
> 1 file changed, 12 insertions(+), 11 deletions(-)
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index f0e1961d4482..59633a617a0a 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1524,20 +1524,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
> return page_action(ps, p, pfn);
> }
>
> -static int try_to_split_thp_page(struct page *page, const char *msg)
> +static int try_to_split_thp_page(struct page *page)
> {
> + int ret;
> +
> lock_page(page);
> - if (unlikely(split_huge_page(page))) {
> - unsigned long pfn = page_to_pfn(page);
> + ret = split_huge_page(page);
> + unlock_page(page);
>
> - unlock_page(page);
> - pr_info("%s: %#lx: thp split failed\n", msg, pfn);
> + if (unlikely(ret))
> put_page(page);
> - return -EBUSY;
> - }
> - unlock_page(page);
>
> - return 0;
> + return ret;
> }
>
> static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
> @@ -2079,7 +2077,7 @@ int memory_failure(unsigned long pfn, int flags)
> * page is a valid handlable page.
> */
> SetPageHasHWPoisoned(hpage);
> - if (try_to_split_thp_page(p, "Memory Failure") < 0) {
> + if (try_to_split_thp_page(p) < 0) {
> action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
> res = -EBUSY;
> goto unlock_mutex;
> @@ -2503,8 +2501,11 @@ static int soft_offline_in_use_page(struct page *page)
> struct page *hpage = compound_head(page);
>
> if (!PageHuge(page) && PageTransHuge(hpage))
> - if (try_to_split_thp_page(page, "soft offline") < 0)
> + if (try_to_split_thp_page(page) < 0) {
> + pr_info("soft offline: %#lx: thp split failed\n",
> + page_to_pfn(page));
> return -EBUSY;
> + }
> return __soft_offline_page(page);
> }
>
> --
> 2.35.3

2022-08-05 03:25:57

by Yin, Fengwei

[permalink] [raw]
Subject: Re: [PATCH] mm: memory-failure: cleanup try_to_split_thp_page()

On 2022/8/4 09:03, Kefeng Wang wrote:
>
> On 2022/8/4 8:45, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Tue, Aug 02, 2022 at 10:12:56AM +0800, Kefeng Wang wrote:
>>> Since commit 5d1fd5dc877b ("mm,hwpoison: introduce MF_MSG_UNSPLIT_THP"),
>>> the action_result() called to show memory error event in
>>> memory_failure(),
>>> so the pr_info() in try_to_split_thp_page() is only needed in
>>> soft_offline_in_use_page().
>>>
>>> Signed-off-by: Kefeng Wang <[email protected]>
>> Thanks, looks good to me.  Probably this patch may come before
>> "mm: memory-failure: convert to pr_fmt()" to be a cleanup.
>>
>> And recently another patch [1] is trying to change the same function,
>> so we
>> might need to resolve the conflict with it.  I expect it's not so
>> hard, but
>> I think your series had better come after [1] because [1] is a bug fix
>> and
>> might be backported to stable-5.19.
> OK, I could repost after[1] merged into mm tree,  thanks.
Thanks a lot for considering the merge conflict. I suppose the private
data releasing will be moved into split_huge_page, so there will be no
code conflict and it will not block merging this patch.


Regards
Yin, Fengwei

>>
>> [1]
>> https://lore.kernel.org/linux-mm/[email protected]/T/#u
>>
>>
>> Thanks,
>> Naoya Horiguchi
>