2023-04-17 04:38:28

by Kefeng Wang

[permalink] [raw]
Subject: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

The dump_user_range() is used to copy the user page to a coredump file,
but if a hardware memory error occurred during copy, which called from
__kernel_write_iter() in dump_user_range(), it crashes,

CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425

pc : __memcpy+0x110/0x260
lr : _copy_from_iter+0x3bc/0x4c8
...
Call trace:
__memcpy+0x110/0x260
copy_page_from_iter+0xcc/0x130
pipe_write+0x164/0x6d8
__kernel_write_iter+0x9c/0x210
dump_user_range+0xc8/0x1d8
elf_core_dump+0x308/0x368
do_coredump+0x2e8/0xa40
get_signal+0x59c/0x788
do_signal+0x118/0x1f8
do_notify_resume+0xf0/0x280
el0_da+0x130/0x138
el0t_64_sync_handler+0x68/0xc0
el0t_64_sync+0x188/0x190

Generally, the '->write_iter' of file ops will use copy_page_from_iter()
and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
in both of them to handle #MC during source read, which stop coredump
processing and kill the task instead of kernel panic, but the source
address may not always a user address, so introduce a new copy_mc flag in
struct iov_iter{} to indicate that the iter could do a safe memory copy,
also introduce the helpers to set/cleck the flag, for now, it's only
used in coredump's dump_user_range(), but it could expand to any other
scenarios to fix the similar issue.

Cc: Alexander Viro <[email protected]>
Cc: Christian Brauner <[email protected]>
Cc: Miaohe Lin <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: Tong Tiangen <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Kefeng Wang <[email protected]>
---
v2:
- move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
- reposition the copy_mc in struct iov_iter for easy merge, suggested
by Andrew Morton
- drop unnecessary clear flag helper
- fix checkpatch warning
fs/coredump.c | 1 +
include/linux/uio.h | 16 ++++++++++++++++
lib/iov_iter.c | 17 +++++++++++++++--
3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+ iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index c459e1d5772b..aa3a4c6ba585 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,6 +40,7 @@ struct iov_iter_state {

struct iov_iter {
u8 iter_type;
+ bool copy_mc;
bool nofault;
bool data_source;
bool user_backed;
@@ -241,8 +242,22 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+static inline void iov_iter_set_copy_mc(struct iov_iter *i)
+{
+ i->copy_mc = true;
+}
+
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+ return i->copy_mc;
+}
#else
#define _copy_mc_to_iter _copy_to_iter
+static inline void iov_iter_set_copy_mc(struct iov_iter *i) { }
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+ return false;
+}
#endif

size_t iov_iter_zero(size_t bytes, struct iov_iter *);
@@ -357,6 +372,7 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_UBUF,
+ .copy_mc = false,
.user_backed = true,
.data_source = direction,
.ubuf = buf,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 08587feb94cc..7b9d8419fee7 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -288,6 +288,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_IOVEC,
+ .copy_mc = false,
.nofault = false,
.user_backed = true,
.data_source = direction,
@@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

+static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
+ size_t size)
+{
+ if (iov_iter_is_copy_mc(i))
+ return (void *)copy_mc_to_kernel(to, from, size);
+ return memcpy(to, from, size);
+}
+
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
if (WARN_ON_ONCE(!i->data_source))
@@ -380,7 +389,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
might_fault();
iterate_and_advance(i, bytes, base, len, off,
copyin(addr + off, base, len),
- memcpy(addr + off, base, len)
+ memcpy_from_iter(i, addr + off, base, len)
)

return bytes;
@@ -571,7 +580,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
}
iterate_and_advance(i, bytes, base, len, off,
copyin(p + off, base, len),
- memcpy(p + off, base, len)
+ memcpy_from_iter(i, p + off, base, len)
)
kunmap_atomic(kaddr);
return bytes;
@@ -704,6 +713,7 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter){
.iter_type = ITER_KVEC,
+ .copy_mc = false,
.data_source = direction,
.kvec = kvec,
.nr_segs = nr_segs,
@@ -720,6 +730,7 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter){
.iter_type = ITER_BVEC,
+ .copy_mc = false,
.data_source = direction,
.bvec = bvec,
.nr_segs = nr_segs,
@@ -748,6 +759,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
BUG_ON(direction & ~1);
*i = (struct iov_iter) {
.iter_type = ITER_XARRAY,
+ .copy_mc = false,
.data_source = direction,
.xarray = xarray,
.xarray_start = start,
@@ -771,6 +783,7 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
BUG_ON(direction != READ);
*i = (struct iov_iter){
.iter_type = ITER_DISCARD,
+ .copy_mc = false,
.data_source = false,
.count = count,
.iov_offset = 0
--
2.35.3


Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
> The dump_user_range() is used to copy the user page to a coredump file,
> but if a hardware memory error occurred during copy, which called from
> __kernel_write_iter() in dump_user_range(), it crashes,
>
> CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>
> pc : __memcpy+0x110/0x260
> lr : _copy_from_iter+0x3bc/0x4c8
> ...
> Call trace:
> __memcpy+0x110/0x260
> copy_page_from_iter+0xcc/0x130
> pipe_write+0x164/0x6d8
> __kernel_write_iter+0x9c/0x210
> dump_user_range+0xc8/0x1d8
> elf_core_dump+0x308/0x368
> do_coredump+0x2e8/0xa40
> get_signal+0x59c/0x788
> do_signal+0x118/0x1f8
> do_notify_resume+0xf0/0x280
> el0_da+0x130/0x138
> el0t_64_sync_handler+0x68/0xc0
> el0t_64_sync+0x188/0x190
>
> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
> in both of them to handle #MC during source read, which stop coredump
> processing and kill the task instead of kernel panic, but the source
> address may not always a user address, so introduce a new copy_mc flag in
> struct iov_iter{} to indicate that the iter could do a safe memory copy,
> also introduce the helpers to set/cleck the flag, for now, it's only
> used in coredump's dump_user_range(), but it could expand to any other
> scenarios to fix the similar issue.
>
> Cc: Alexander Viro <[email protected]>
> Cc: Christian Brauner <[email protected]>
> Cc: Miaohe Lin <[email protected]>
> Cc: Naoya Horiguchi <[email protected]>
> Cc: Tong Tiangen <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: Kefeng Wang <[email protected]>
> ---
> v2:
> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
> - reposition the copy_mc in struct iov_iter for easy merge, suggested
> by Andrew Morton
> - drop unnecessary clear flag helper
> - fix checkpatch warning
> fs/coredump.c | 1 +
> include/linux/uio.h | 16 ++++++++++++++++
> lib/iov_iter.c | 17 +++++++++++++++--
> 3 files changed, 32 insertions(+), 2 deletions(-)
>
...
> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
> EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
> #endif /* CONFIG_ARCH_HAS_COPY_MC */
>
> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
> + size_t size)
> +{
> + if (iov_iter_is_copy_mc(i))
> + return (void *)copy_mc_to_kernel(to, from, size);

Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
due to a memory error?

Thanks,
Naoya Horiguchi

> + return memcpy(to, from, size);
> +}
> +
> size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
> {
> if (WARN_ON_ONCE(!i->data_source))

2023-04-18 09:48:37

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>> The dump_user_range() is used to copy the user page to a coredump file,
>> but if a hardware memory error occurred during copy, which called from
>> __kernel_write_iter() in dump_user_range(), it crashes,
>>
>> CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>
>> pc : __memcpy+0x110/0x260
>> lr : _copy_from_iter+0x3bc/0x4c8
>> ...
>> Call trace:
>> __memcpy+0x110/0x260
>> copy_page_from_iter+0xcc/0x130
>> pipe_write+0x164/0x6d8
>> __kernel_write_iter+0x9c/0x210
>> dump_user_range+0xc8/0x1d8
>> elf_core_dump+0x308/0x368
>> do_coredump+0x2e8/0xa40
>> get_signal+0x59c/0x788
>> do_signal+0x118/0x1f8
>> do_notify_resume+0xf0/0x280
>> el0_da+0x130/0x138
>> el0t_64_sync_handler+0x68/0xc0
>> el0t_64_sync+0x188/0x190
>>
>> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
>> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
>> in both of them to handle #MC during source read, which stop coredump
>> processing and kill the task instead of kernel panic, but the source
>> address may not always a user address, so introduce a new copy_mc flag in
>> struct iov_iter{} to indicate that the iter could do a safe memory copy,
>> also introduce the helpers to set/cleck the flag, for now, it's only
>> used in coredump's dump_user_range(), but it could expand to any other
>> scenarios to fix the similar issue.
>>
>> Cc: Alexander Viro <[email protected]>
>> Cc: Christian Brauner <[email protected]>
>> Cc: Miaohe Lin <[email protected]>
>> Cc: Naoya Horiguchi <[email protected]>
>> Cc: Tong Tiangen <[email protected]>
>> Cc: Jens Axboe <[email protected]>
>> Signed-off-by: Kefeng Wang <[email protected]>
>> ---
>> v2:
>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>> by Andrew Morton
>> - drop unnecessary clear flag helper
>> - fix checkpatch warning
>> fs/coredump.c | 1 +
>> include/linux/uio.h | 16 ++++++++++++++++
>> lib/iov_iter.c | 17 +++++++++++++++--
>> 3 files changed, 32 insertions(+), 2 deletions(-)
>>
> ...
>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>> EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>> #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>
>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
>> + size_t size)
>> +{
>> + if (iov_iter_is_copy_mc(i))
>> + return (void *)copy_mc_to_kernel(to, from, size);
>
> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
> due to a memory error?

For dump_user_range(), the task is dying, if copy incomplete size, the
coredump will fail and task will exit, also memory_failure will
be called by kill_me_maybe(),

CPU: 0 PID: 1418 Comm: test Tainted: G M 6.3.0-rc5 #29
Call Trace:
<TASK>
dump_stack_lvl+0x37/0x50
memory_failure+0x51/0x970
kill_me_maybe+0x5b/0xc0
task_work_run+0x5a/0x90
exit_to_user_mode_prepare+0x194/0x1a0
irqentry_exit_to_user_mode+0x9/0x30
noist_exc_machine_check+0x40/0x80
asm_exc_machine_check+0x33/0x40


>
> Thanks,
> Naoya Horiguchi
>
>> + return memcpy(to, from, size);
>> +}
>> +
>> size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
>> {
>> if (WARN_ON_ONCE(!i->data_source))

Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>
>
> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> > On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
> > > The dump_user_range() is used to copy the user page to a coredump file,
> > > but if a hardware memory error occurred during copy, which called from
> > > __kernel_write_iter() in dump_user_range(), it crashes,
> > >
> > > CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
> > > pc : __memcpy+0x110/0x260
> > > lr : _copy_from_iter+0x3bc/0x4c8
> > > ...
> > > Call trace:
> > > __memcpy+0x110/0x260
> > > copy_page_from_iter+0xcc/0x130
> > > pipe_write+0x164/0x6d8
> > > __kernel_write_iter+0x9c/0x210
> > > dump_user_range+0xc8/0x1d8
> > > elf_core_dump+0x308/0x368
> > > do_coredump+0x2e8/0xa40
> > > get_signal+0x59c/0x788
> > > do_signal+0x118/0x1f8
> > > do_notify_resume+0xf0/0x280
> > > el0_da+0x130/0x138
> > > el0t_64_sync_handler+0x68/0xc0
> > > el0t_64_sync+0x188/0x190
> > >
> > > Generally, the '->write_iter' of file ops will use copy_page_from_iter()
> > > and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
> > > in both of them to handle #MC during source read, which stop coredump
> > > processing and kill the task instead of kernel panic, but the source
> > > address may not always a user address, so introduce a new copy_mc flag in
> > > struct iov_iter{} to indicate that the iter could do a safe memory copy,
> > > also introduce the helpers to set/cleck the flag, for now, it's only
> > > used in coredump's dump_user_range(), but it could expand to any other
> > > scenarios to fix the similar issue.
> > >
> > > Cc: Alexander Viro <[email protected]>
> > > Cc: Christian Brauner <[email protected]>
> > > Cc: Miaohe Lin <[email protected]>
> > > Cc: Naoya Horiguchi <[email protected]>
> > > Cc: Tong Tiangen <[email protected]>
> > > Cc: Jens Axboe <[email protected]>
> > > Signed-off-by: Kefeng Wang <[email protected]>
> > > ---
> > > v2:
> > > - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
> > > - reposition the copy_mc in struct iov_iter for easy merge, suggested
> > > by Andrew Morton
> > > - drop unnecessary clear flag helper
> > > - fix checkpatch warning
> > > fs/coredump.c | 1 +
> > > include/linux/uio.h | 16 ++++++++++++++++
> > > lib/iov_iter.c | 17 +++++++++++++++--
> > > 3 files changed, 32 insertions(+), 2 deletions(-)
> > >
> > ...
> > > @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
> > > EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
> > > #endif /* CONFIG_ARCH_HAS_COPY_MC */
> > > +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
> > > + size_t size)
> > > +{
> > > + if (iov_iter_is_copy_mc(i))
> > > + return (void *)copy_mc_to_kernel(to, from, size);
> >
> > Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
> > due to a memory error?
>
> For dump_user_range(), the task is dying, if copy incomplete size, the
> coredump will fail and task will exit, also memory_failure will
> be called by kill_me_maybe(),
>
> CPU: 0 PID: 1418 Comm: test Tainted: G M 6.3.0-rc5 #29
> Call Trace:
> <TASK>
> dump_stack_lvl+0x37/0x50
> memory_failure+0x51/0x970
> kill_me_maybe+0x5b/0xc0
> task_work_run+0x5a/0x90
> exit_to_user_mode_prepare+0x194/0x1a0
> irqentry_exit_to_user_mode+0x9/0x30
> noist_exc_machine_check+0x40/0x80
> asm_exc_machine_check+0x33/0x40

Is this call trace printed out when copy_mc_to_kernel() failed by finding
a memory error (or in some testcase using error injection)?
In my understanding, an MCE should not be triggered when MC-safe copy tries
to access to a memory error. So I feel that we might be talking about
different scenarios.

When I questioned previously, I thought about the following scenario:

- a process terminates abnormally for any reason like segmentation fault,
- then, kernel tries to create a coredump,
- during this, the copying routine accesses to corrupted page to read.

In this case the corrupted page should not be handled by memory_failure()
yet (because otherwise properly handled hwpoisoned page should be ignored
by coredump process). The coredump process would exit with failure with
your patch, but then, the corrupted page is still left unhandled and can
be reused, so any other thread can easily access to it again.

You can find a few other places (like __wp_page_copy_user and ksm_might_need_to_copy)
to call memory_failure_queue() to cope with such unhandled error pages.
So does memcpy_from_iter() do the same?

Thanks,
Naoya Horiguchi

2023-04-19 12:05:49

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>
>>
>> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>>>> The dump_user_range() is used to copy the user page to a coredump file,
>>>> but if a hardware memory error occurred during copy, which called from
>>>> __kernel_write_iter() in dump_user_range(), it crashes,
>>>>
>>>> CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>>> pc : __memcpy+0x110/0x260
>>>> lr : _copy_from_iter+0x3bc/0x4c8
>>>> ...
>>>> Call trace:
>>>> __memcpy+0x110/0x260
>>>> copy_page_from_iter+0xcc/0x130
>>>> pipe_write+0x164/0x6d8
>>>> __kernel_write_iter+0x9c/0x210
>>>> dump_user_range+0xc8/0x1d8
>>>> elf_core_dump+0x308/0x368
>>>> do_coredump+0x2e8/0xa40
>>>> get_signal+0x59c/0x788
>>>> do_signal+0x118/0x1f8
>>>> do_notify_resume+0xf0/0x280
>>>> el0_da+0x130/0x138
>>>> el0t_64_sync_handler+0x68/0xc0
>>>> el0t_64_sync+0x188/0x190
>>>>
>>>> Generally, the '->write_iter' of file ops will use copy_page_from_iter()
>>>> and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel()
>>>> in both of them to handle #MC during source read, which stop coredump
>>>> processing and kill the task instead of kernel panic, but the source
>>>> address may not always a user address, so introduce a new copy_mc flag in
>>>> struct iov_iter{} to indicate that the iter could do a safe memory copy,
>>>> also introduce the helpers to set/cleck the flag, for now, it's only
>>>> used in coredump's dump_user_range(), but it could expand to any other
>>>> scenarios to fix the similar issue.
>>>>
>>>> Cc: Alexander Viro <[email protected]>
>>>> Cc: Christian Brauner <[email protected]>
>>>> Cc: Miaohe Lin <[email protected]>
>>>> Cc: Naoya Horiguchi <[email protected]>
>>>> Cc: Tong Tiangen <[email protected]>
>>>> Cc: Jens Axboe <[email protected]>
>>>> Signed-off-by: Kefeng Wang <[email protected]>
>>>> ---
>>>> v2:
>>>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>>>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>>>> by Andrew Morton
>>>> - drop unnecessary clear flag helper
>>>> - fix checkpatch warning
>>>> fs/coredump.c | 1 +
>>>> include/linux/uio.h | 16 ++++++++++++++++
>>>> lib/iov_iter.c | 17 +++++++++++++++--
>>>> 3 files changed, 32 insertions(+), 2 deletions(-)
>>>>
>>> ...
>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>>>> EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>> #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
>>>> + size_t size)
>>>> +{
>>>> + if (iov_iter_is_copy_mc(i))
>>>> + return (void *)copy_mc_to_kernel(to, from, size);
>>>
>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel() fails
>>> due to a memory error?
>>
>> For dump_user_range(), the task is dying, if copy incomplete size, the
>> coredump will fail and task will exit, also memory_failure will
>> be called by kill_me_maybe(),
>>
>> CPU: 0 PID: 1418 Comm: test Tainted: G M 6.3.0-rc5 #29
>> Call Trace:
>> <TASK>
>> dump_stack_lvl+0x37/0x50
>> memory_failure+0x51/0x970
>> kill_me_maybe+0x5b/0xc0
>> task_work_run+0x5a/0x90
>> exit_to_user_mode_prepare+0x194/0x1a0
>> irqentry_exit_to_user_mode+0x9/0x30
>> noist_exc_machine_check+0x40/0x80
>> asm_exc_machine_check+0x33/0x40
>
> Is this call trace printed out when copy_mc_to_kernel() failed by finding
> a memory error (or in some testcase using error injection)?

I add dump_stack() into memory_failure() to check whether the poisoned
memory is called or not, and the call trace shows it do call
memory_failure(), but I get confused when do the test.

> In my understanding, an MCE should not be triggered when MC-safe copy tries
> to access to a memory error. So I feel that we might be talking about
> different scenarios.
>
> When I questioned previously, I thought about the following scenario:
>
> - a process terminates abnormally for any reason like segmentation fault,
> - then, kernel tries to create a coredump,
> - during this, the copying routine accesses to corrupted page to read.
>
Yes, we tested like your described,

1) inject memory error into a process
2) send a SIGABT/SIGBUS to process to trigger the coredump

Without patch, the system panic, and with patch only process exits.

> In this case the corrupted page should not be handled by memory_failure()
> yet (because otherwise properly handled hwpoisoned page should be ignored
> by coredump process). The coredump process would exit with failure with
> your patch, but then, the corrupted page is still left unhandled and can
> be reused, so any other thread can easily access to it again.

As shown above, the corrupted page will be handled by memory_failure(),
but what I'm wondering,
1) memory_failure() is not always called
2) look at the above call trace, it looks like from asynchronous
interrupt, not from synchronous exception, right?

>
> You can find a few other places (like __wp_page_copy_user and ksm_might_need_to_copy)
> to call memory_failure_queue() to cope with such unhandled error pages.
> So does memcpy_from_iter() do the same?

I add some debug print in do_machine_check() on x86:

1) COW,
m.kflags: MCE_IN_KERNEL_RECOV
fixup_type: EX_TYPE_DEFAULT_MCE_SAFE

CPU: 11 PID: 2038 Comm: einj_mem_uc
Call Trace:
<#MC>
dump_stack_lvl+0x37/0x50
do_machine_check+0x7ad/0x840
exc_machine_check+0x5a/0x90
asm_exc_machine_check+0x1e/0x40
RIP: 0010:copy_mc_fragile+0x35/0x62

if (m.kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
}

if (m.kflags & MCE_IN_KERNEL_COPYIN)
queue_task_work(&m, msg, kill_me_never);

There is no memory_failure() called when
EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
so we manually add a memory_failure_queue() to handle with
the poisoned page.

2) Coredump, nothing print about m.kflags and fixup_type,
with above check, add a memory_failure_queue() or memory_failure() seems
to be needed for memcpy_from_iter(), but it is totally different from
the COW scenario


Another question, other copy_mc_to_kernel() callers, eg,
nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
should they need a memory_failure_queue(), if so, why not add it into
do_machine_check() ?

Thanks.



>
> Thanks,
> Naoya Horiguchi

2023-04-20 02:04:41

by Jane Chu

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()


On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>
>
> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/18 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>> On Mon, Apr 17, 2023 at 12:53:23PM +0800, Kefeng Wang wrote:
>>>>> The dump_user_range() is used to copy the user page to a coredump
>>>>> file,
>>>>> but if a hardware memory error occurred during copy, which called from
>>>>> __kernel_write_iter() in dump_user_range(), it crashes,
>>>>>
>>>>>     CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425
>>>>>     pc : __memcpy+0x110/0x260
>>>>>     lr : _copy_from_iter+0x3bc/0x4c8
>>>>>     ...
>>>>>     Call trace:
>>>>>      __memcpy+0x110/0x260
>>>>>      copy_page_from_iter+0xcc/0x130
>>>>>      pipe_write+0x164/0x6d8
>>>>>      __kernel_write_iter+0x9c/0x210
>>>>>      dump_user_range+0xc8/0x1d8
>>>>>      elf_core_dump+0x308/0x368
>>>>>      do_coredump+0x2e8/0xa40
>>>>>      get_signal+0x59c/0x788
>>>>>      do_signal+0x118/0x1f8
>>>>>      do_notify_resume+0xf0/0x280
>>>>>      el0_da+0x130/0x138
>>>>>      el0t_64_sync_handler+0x68/0xc0
>>>>>      el0t_64_sync+0x188/0x190
>>>>>
>>>>> Generally, the '->write_iter' of file ops will use
>>>>> copy_page_from_iter()
>>>>> and copy_page_from_iter_atomic(), change memcpy() to
>>>>> copy_mc_to_kernel()
>>>>> in both of them to handle #MC during source read, which stop coredump
>>>>> processing and kill the task instead of kernel panic, but the source
>>>>> address may not always a user address, so introduce a new copy_mc
>>>>> flag in
>>>>> struct iov_iter{} to indicate that the iter could do a safe memory
>>>>> copy,
>>>>> also introduce the helpers to set/cleck the flag, for now, it's only
>>>>> used in coredump's dump_user_range(), but it could expand to any other
>>>>> scenarios to fix the similar issue.
>>>>>
>>>>> Cc: Alexander Viro <[email protected]>
>>>>> Cc: Christian Brauner <[email protected]>
>>>>> Cc: Miaohe Lin <[email protected]>
>>>>> Cc: Naoya Horiguchi <[email protected]>
>>>>> Cc: Tong Tiangen <[email protected]>
>>>>> Cc: Jens Axboe <[email protected]>
>>>>> Signed-off-by: Kefeng Wang <[email protected]>
>>>>> ---
>>>>> v2:
>>>>> - move the helper functions under pre-existing CONFIG_ARCH_HAS_COPY_MC
>>>>> - reposition the copy_mc in struct iov_iter for easy merge, suggested
>>>>>     by Andrew Morton
>>>>> - drop unnecessary clear flag helper
>>>>> - fix checkpatch warning
>>>>>    fs/coredump.c       |  1 +
>>>>>    include/linux/uio.h | 16 ++++++++++++++++
>>>>>    lib/iov_iter.c      | 17 +++++++++++++++--
>>>>>    3 files changed, 32 insertions(+), 2 deletions(-)
>>>>>
>>>> ...
>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr,
>>>>> size_t bytes, struct iov_iter *i)
>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const
>>>>> void *from,
>>>>> +                 size_t size)
>>>>> +{
>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>
>>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel()
>>>> fails
>>>> due to a memory error?
>>>
>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>> coredump will fail and task will exit, also memory_failure will
>>> be called by kill_me_maybe(),
>>>
>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5 #29
>>>   Call Trace:
>>>    <TASK>
>>>    dump_stack_lvl+0x37/0x50
>>>    memory_failure+0x51/0x970
>>>    kill_me_maybe+0x5b/0xc0
>>>    task_work_run+0x5a/0x90
>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>    noist_exc_machine_check+0x40/0x80
>>>    asm_exc_machine_check+0x33/0x40
>>
>> Is this call trace printed out when copy_mc_to_kernel() failed by finding
>> a memory error (or in some testcase using error injection)?
>
> I add dump_stack() into memory_failure() to check whether the poisoned
> memory is called or not, and the call trace shows it do call
> memory_failure(), but I get confused when do the test.
>
>> In my understanding, an MCE should not be triggered when MC-safe copy
>> tries
>> to access to a memory error.  So I feel that we might be talking about
>> different scenarios.
>>
>> When I questioned previously, I thought about the following scenario:
>>
>>    - a process terminates abnormally for any reason like segmentation
>> fault,
>>    - then, kernel tries to create a coredump,
>>    - during this, the copying routine accesses to corrupted page to read.
>>
> Yes, we tested like your described,
>
> 1) inject memory error into a process
> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>
> Without patch, the system panic, and with patch only process exits.
>
>> In this case the corrupted page should not be handled by memory_failure()
>> yet (because otherwise properly handled hwpoisoned page should be ignored
>> by coredump process).  The coredump process would exit with failure with
>> your patch, but then, the corrupted page is still left unhandled and can
>> be reused, so any other thread can easily access to it again.
>
> As shown above, the corrupted page will be handled by memory_failure(),
> but what I'm wondering,
> 1) memory_failure() is not always called
> 2) look at the above call trace, it looks like from asynchronous
>    interrupt, not from synchronous exception, right?
>
>>
>> You can find a few other places (like __wp_page_copy_user and
>> ksm_might_need_to_copy)
>> to call memory_failure_queue() to cope with such unhandled error pages.
>> So does memcpy_from_iter() do the same?
>
> I add some debug print in do_machine_check() on x86:
>
> 1) COW,
>   m.kflags: MCE_IN_KERNEL_RECOV
>   fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>
>   CPU: 11 PID: 2038 Comm: einj_mem_uc
>   Call Trace:
>    <#MC>
>    dump_stack_lvl+0x37/0x50
>    do_machine_check+0x7ad/0x840
>    exc_machine_check+0x5a/0x90
>    asm_exc_machine_check+0x1e/0x40
>   RIP: 0010:copy_mc_fragile+0x35/0x62
>
>   if (m.kflags & MCE_IN_KERNEL_RECOV) {
>           if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>                   mce_panic("Failed kernel mode recovery", &m, msg);
>   }
>
>   if (m.kflags & MCE_IN_KERNEL_COPYIN)
>           queue_task_work(&m, msg, kill_me_never);
>
> There is no memory_failure() called when
> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
> so we manually add a memory_failure_queue() to handle with
> the poisoned page.
>
> 2) Coredump,  nothing print about m.kflags and fixup_type,
> with above check, add a memory_failure_queue() or memory_failure() seems
> to be needed for memcpy_from_iter(), but it is totally different from
> the COW scenario
>
>
> Another question, other copy_mc_to_kernel() callers, eg,
> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> should they need a memory_failure_queue(), if so, why not add it into
> do_machine_check() ?

In the dax case, if the source address is poisoned, and we do follow up
with memory_failure_queue(pfn, flags), what should the value of the
'flags' be ?

thanks,
-jane

>
> Thanks.
>
>
>
>>
>> Thanks,
>> Naoya Horiguchi
>

2023-04-20 03:09:41

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/20 10:03, Jane Chu wrote:
>
> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>
>>
>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>
>>>>
...
>>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr,
>>>>>> size_t bytes, struct iov_iter *i)
>>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to, const
>>>>>> void *from,
>>>>>> +                 size_t size)
>>>>>> +{
>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>
>>>>> Is it helpful to call memory_failure_queue() if copy_mc_to_kernel()
>>>>> fails
>>>>> due to a memory error?
>>>>
>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>> coredump will fail and task will exit, also memory_failure will
>>>> be called by kill_me_maybe(),
>>>>
>>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M               6.3.0-rc5
>>>> #29
>>>>   Call Trace:
>>>>    <TASK>
>>>>    dump_stack_lvl+0x37/0x50
>>>>    memory_failure+0x51/0x970
>>>>    kill_me_maybe+0x5b/0xc0
>>>>    task_work_run+0x5a/0x90
>>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>>    noist_exc_machine_check+0x40/0x80
>>>>    asm_exc_machine_check+0x33/0x40
>>>
>>> Is this call trace printed out when copy_mc_to_kernel() failed by
>>> finding
>>> a memory error (or in some testcase using error injection)?
>>
>> I add dump_stack() into memory_failure() to check whether the poisoned
>> memory is called or not, and the call trace shows it do call
>> memory_failure(), but I get confused when do the test.
>>
>>> In my understanding, an MCE should not be triggered when MC-safe copy
>>> tries
>>> to access to a memory error.  So I feel that we might be talking about
>>> different scenarios.
>>>
>>> When I questioned previously, I thought about the following scenario:
>>>
>>>    - a process terminates abnormally for any reason like segmentation
>>> fault,
>>>    - then, kernel tries to create a coredump,
>>>    - during this, the copying routine accesses to corrupted page to
>>> read.
>>>
>> Yes, we tested like your described,
>>
>> 1) inject memory error into a process
>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>
>> Without patch, the system panic, and with patch only process exits.
>>
>>> In this case the corrupted page should not be handled by
>>> memory_failure()
>>> yet (because otherwise properly handled hwpoisoned page should be
>>> ignored
>>> by coredump process).  The coredump process would exit with failure with
>>> your patch, but then, the corrupted page is still left unhandled and can
>>> be reused, so any other thread can easily access to it again.
>>
>> As shown above, the corrupted page will be handled by
>> memory_failure(), but what I'm wondering,
>> 1) memory_failure() is not always called
>> 2) look at the above call trace, it looks like from asynchronous
>>     interrupt, not from synchronous exception, right?
>>
>>>
>>> You can find a few other places (like __wp_page_copy_user and
>>> ksm_might_need_to_copy)
>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>> So does memcpy_from_iter() do the same?
>>
>> I add some debug print in do_machine_check() on x86:
>>
>> 1) COW,
>>    m.kflags: MCE_IN_KERNEL_RECOV
>>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>
>>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>>    Call Trace:
>>     <#MC>
>>     dump_stack_lvl+0x37/0x50
>>     do_machine_check+0x7ad/0x840
>>     exc_machine_check+0x5a/0x90
>>     asm_exc_machine_check+0x1e/0x40
>>    RIP: 0010:copy_mc_fragile+0x35/0x62
>>
>>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>                    mce_panic("Failed kernel mode recovery", &m, msg);
>>    }
>>
>>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>            queue_task_work(&m, msg, kill_me_never);
>>
>> There is no memory_failure() called when
>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>> so we manually add a memory_failure_queue() to handle with
>> the poisoned page.
>>
>> 2) Coredump,  nothing print about m.kflags and fixup_type,
>> with above check, add a memory_failure_queue() or memory_failure() seems
>> to be needed for memcpy_from_iter(), but it is totally different from
>> the COW scenario
>>
>>
>> Another question, other copy_mc_to_kernel() callers, eg,
>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>> should they need a memory_failure_queue(), if so, why not add it into
>> do_machine_check() ?
>

What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
is designed to identify fixups which allow in kernel #MC recovery,
that is, the caller of copy_mc_to_kernel() must know the source
is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
the MCE_SAFE type.

diff --git a/arch/x86/kernel/cpu/mce/severity.c
b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..63e94484c5d6 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
struct pt_regs *regs)
case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
fallthrough;

case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
- m->kflags |= MCE_IN_KERNEL_RECOV;
+ m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
return IN_KERNEL_RECOV;

default:

then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
or every Machine Check safe memory copy will need a memory_failure_xx()
call.

+Thomas,who add the two types, could you share some comments about
this,thanks.

> In the dax case, if the source address is poisoned, and we do follow up
> with memory_failure_queue(pfn, flags), what should the value of the
> 'flags' be ?


I think flags = 0 is enough to for all copy_mc_xxx to isolate the
poisoned page.

Thanks.

2023-04-20 15:22:33

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/20 10:59, Kefeng Wang wrote:
>
>
> On 2023/4/20 10:03, Jane Chu wrote:
>>
>> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>>
>>>>>
> ...
>>>>>>> @@ -371,6 +372,14 @@ size_t _copy_mc_to_iter(const void *addr,
>>>>>>> size_t bytes, struct iov_iter *i)
>>>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>>> +static void *memcpy_from_iter(struct iov_iter *i, void *to,
>>>>>>> const void *from,
>>>>>>> +                 size_t size)
>>>>>>> +{
>>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>>
>>>>>> Is it helpful to call memory_failure_queue() if
>>>>>> copy_mc_to_kernel() fails
>>>>>> due to a memory error?
>>>>>
>>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>>> coredump will fail and task will exit, also memory_failure will
>>>>> be called by kill_me_maybe(),
>>>>>
>>>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M
>>>>> 6.3.0-rc5 #29
>>>>>   Call Trace:
>>>>>    <TASK>
>>>>>    dump_stack_lvl+0x37/0x50
>>>>>    memory_failure+0x51/0x970
>>>>>    kill_me_maybe+0x5b/0xc0
>>>>>    task_work_run+0x5a/0x90
>>>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>>>    noist_exc_machine_check+0x40/0x80
>>>>>    asm_exc_machine_check+0x33/0x40
>>>>
>>>> Is this call trace printed out when copy_mc_to_kernel() failed by
>>>> finding
>>>> a memory error (or in some testcase using error injection)?
>>>
>>> I add dump_stack() into memory_failure() to check whether the poisoned
>>> memory is called or not, and the call trace shows it do call
>>> memory_failure(), but I get confused when do the test.
>>>
>>>> In my understanding, an MCE should not be triggered when MC-safe
>>>> copy tries
>>>> to access to a memory error.  So I feel that we might be talking about
>>>> different scenarios.
>>>>
>>>> When I questioned previously, I thought about the following scenario:
>>>>
>>>>    - a process terminates abnormally for any reason like
>>>> segmentation fault,
>>>>    - then, kernel tries to create a coredump,
>>>>    - during this, the copying routine accesses to corrupted page to
>>>> read.
>>>>
>>> Yes, we tested like your described,
>>>
>>> 1) inject memory error into a process
>>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>>
>>> Without patch, the system panic, and with patch only process exits.
>>>
>>>> In this case the corrupted page should not be handled by
>>>> memory_failure()
>>>> yet (because otherwise properly handled hwpoisoned page should be
>>>> ignored
>>>> by coredump process).  The coredump process would exit with failure
>>>> with
>>>> your patch, but then, the corrupted page is still left unhandled and
>>>> can
>>>> be reused, so any other thread can easily access to it again.
>>>
>>> As shown above, the corrupted page will be handled by
>>> memory_failure(), but what I'm wondering,
>>> 1) memory_failure() is not always called
>>> 2) look at the above call trace, it looks like from asynchronous
>>>     interrupt, not from synchronous exception, right?
>>>
>>>>
>>>> You can find a few other places (like __wp_page_copy_user and
>>>> ksm_might_need_to_copy)
>>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>>> So does memcpy_from_iter() do the same?
>>>
>>> I add some debug print in do_machine_check() on x86:
>>>
>>> 1) COW,
>>>    m.kflags: MCE_IN_KERNEL_RECOV
>>>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>>
>>>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>>>    Call Trace:
>>>     <#MC>
>>>     dump_stack_lvl+0x37/0x50
>>>     do_machine_check+0x7ad/0x840
>>>     exc_machine_check+0x5a/0x90
>>>     asm_exc_machine_check+0x1e/0x40
>>>    RIP: 0010:copy_mc_fragile+0x35/0x62
>>>
>>>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>>                    mce_panic("Failed kernel mode recovery", &m, msg);
>>>    }
>>>
>>>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>>            queue_task_work(&m, msg, kill_me_never);
>>>
>>> There is no memory_failure() called when
>>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>>> so we manually add a memory_failure_queue() to handle with
>>> the poisoned page.
>>>
>>> 2) Coredump,  nothing print about m.kflags and fixup_type,

Sorry,I forget to set coredump file size :(

The coredump do trigger the do_machine_check() with same m.kflags and
fixup_type like cow


>>> with above check, add a memory_failure_queue() or memory_failure() seems
>>> to be needed for memcpy_from_iter(), but it is totally different from
>>> the COW scenario
>>>

so the memcpy_from_iter() from coredump is same as cow scenario.

>>>
>>> Another question, other copy_mc_to_kernel() callers, eg,
>>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>>> should they need a memory_failure_queue(), if so, why not add it into
>>> do_machine_check() ?
>>
>
> What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> is designed to identify fixups which allow in kernel #MC recovery,
> that is, the caller of copy_mc_to_kernel() must know the source
> is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> the MCE_SAFE type.

And I think we need the following change for MCE_SAFE copy to set
MCE_IN_KERNEL_COPYIN.

>
> diff --git a/arch/x86/kernel/cpu/mce/severity.c
> b/arch/x86/kernel/cpu/mce/severity.c
> index c4477162c07d..63e94484c5d6 100644
> --- a/arch/x86/kernel/cpu/mce/severity.c
> +++ b/arch/x86/kernel/cpu/mce/severity.c
> @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
> struct pt_regs *regs)
>         case EX_TYPE_COPY:
>                 if (!copy_user)
>                         return IN_KERNEL;
> -               m->kflags |= MCE_IN_KERNEL_COPYIN;
>                 fallthrough;
>
>         case EX_TYPE_FAULT_MCE_SAFE:
>         case EX_TYPE_DEFAULT_MCE_SAFE:
> -               m->kflags |= MCE_IN_KERNEL_RECOV;
> +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
>                 return IN_KERNEL_RECOV;
>
>         default:
>
> then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
> or every Machine Check safe memory copy will need a memory_failure_xx()
> call.

which help use to kill unneeded memory_failure_queue() call, any comments?

>
> +Thomas,who add the two types, could you share some comments about
> this,thanks.
>
>> In the dax case, if the source address is poisoned, and we do follow
>> up with memory_failure_queue(pfn, flags), what should the value of the
>> 'flags' be ?
>

With above diff change, we don't add a memory_failure_queue() into dax too.

Thanks

>
> I think flags = 0 is enough to for all copy_mc_xxx to isolate the
> poisoned page.
>
> Thanks.

Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

On Thu, Apr 20, 2023 at 11:05:12PM +0800, Kefeng Wang wrote:
>
>
> On 2023/4/20 10:59, Kefeng Wang wrote:
> >
> >
> > On 2023/4/20 10:03, Jane Chu wrote:
> > >
> > > On 4/19/2023 5:03 AM, Kefeng Wang wrote:
> > > >
> > > >
> > > > On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
> > > > > On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
> > > > > >
> > > > > >
> > ...
> > > > > > > > @@ -371,6 +372,14 @@ size_t
> > > > > > > > _copy_mc_to_iter(const void *addr, size_t bytes,
> > > > > > > > struct iov_iter *i)
> > > > > > > >    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
> > > > > > > >    #endif /* CONFIG_ARCH_HAS_COPY_MC */
> > > > > > > > +static void *memcpy_from_iter(struct iov_iter
> > > > > > > > *i, void *to, const void *from,
> > > > > > > > +                 size_t size)
> > > > > > > > +{
> > > > > > > > +    if (iov_iter_is_copy_mc(i))
> > > > > > > > +        return (void *)copy_mc_to_kernel(to, from, size);
> > > > > > >
> > > > > > > Is it helpful to call memory_failure_queue() if
> > > > > > > copy_mc_to_kernel() fails
> > > > > > > due to a memory error?
> > > > > >
> > > > > > For dump_user_range(), the task is dying, if copy incomplete size, the
> > > > > > coredump will fail and task will exit, also memory_failure will
> > > > > > be called by kill_me_maybe(),
> > > > > >
> > > > > >   CPU: 0 PID: 1418 Comm: test Tainted: G   M
> > > > > > 6.3.0-rc5 #29
> > > > > >   Call Trace:
> > > > > >    <TASK>
> > > > > >    dump_stack_lvl+0x37/0x50
> > > > > >    memory_failure+0x51/0x970
> > > > > >    kill_me_maybe+0x5b/0xc0
> > > > > >    task_work_run+0x5a/0x90
> > > > > >    exit_to_user_mode_prepare+0x194/0x1a0
> > > > > >    irqentry_exit_to_user_mode+0x9/0x30
> > > > > >    noist_exc_machine_check+0x40/0x80
> > > > > >    asm_exc_machine_check+0x33/0x40
> > > > >
> > > > > Is this call trace printed out when copy_mc_to_kernel()
> > > > > failed by finding
> > > > > a memory error (or in some testcase using error injection)?
> > > >
> > > > I add dump_stack() into memory_failure() to check whether the poisoned
> > > > memory is called or not, and the call trace shows it do call
> > > > memory_failure(), but I get confused when do the test.
> > > >
> > > > > In my understanding, an MCE should not be triggered when
> > > > > MC-safe copy tries
> > > > > to access to a memory error.  So I feel that we might be talking about
> > > > > different scenarios.
> > > > >
> > > > > When I questioned previously, I thought about the following scenario:
> > > > >
> > > > >    - a process terminates abnormally for any reason like
> > > > > segmentation fault,
> > > > >    - then, kernel tries to create a coredump,
> > > > >    - during this, the copying routine accesses to corrupted
> > > > > page to read.
> > > > >
> > > > Yes, we tested like your described,
> > > >
> > > > 1) inject memory error into a process
> > > > 2) send a SIGABT/SIGBUS to process to trigger the coredump
> > > >
> > > > Without patch, the system panic, and with patch only process exits.
> > > >
> > > > > In this case the corrupted page should not be handled by
> > > > > memory_failure()
> > > > > yet (because otherwise properly handled hwpoisoned page
> > > > > should be ignored
> > > > > by coredump process).  The coredump process would exit with
> > > > > failure with
> > > > > your patch, but then, the corrupted page is still left
> > > > > unhandled and can
> > > > > be reused, so any other thread can easily access to it again.
> > > >
> > > > As shown above, the corrupted page will be handled by
> > > > memory_failure(), but what I'm wondering,
> > > > 1) memory_failure() is not always called
> > > > 2) look at the above call trace, it looks like from asynchronous
> > > >     interrupt, not from synchronous exception, right?
> > > >
> > > > >
> > > > > You can find a few other places (like __wp_page_copy_user
> > > > > and ksm_might_need_to_copy)
> > > > > to call memory_failure_queue() to cope with such unhandled error pages.
> > > > > So does memcpy_from_iter() do the same?
> > > >
> > > > I add some debug print in do_machine_check() on x86:
> > > >
> > > > 1) COW,
> > > >    m.kflags: MCE_IN_KERNEL_RECOV
> > > >    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
> > > >
> > > >    CPU: 11 PID: 2038 Comm: einj_mem_uc
> > > >    Call Trace:
> > > >     <#MC>
> > > >     dump_stack_lvl+0x37/0x50
> > > >     do_machine_check+0x7ad/0x840
> > > >     exc_machine_check+0x5a/0x90
> > > >     asm_exc_machine_check+0x1e/0x40
> > > >    RIP: 0010:copy_mc_fragile+0x35/0x62
> > > >
> > > >    if (m.kflags & MCE_IN_KERNEL_RECOV) {
> > > >            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> > > >                    mce_panic("Failed kernel mode recovery", &m, msg);
> > > >    }
> > > >
> > > >    if (m.kflags & MCE_IN_KERNEL_COPYIN)
> > > >            queue_task_work(&m, msg, kill_me_never);
> > > >
> > > > There is no memory_failure() called when
> > > > EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
> > > > so we manually add a memory_failure_queue() to handle with
> > > > the poisoned page.
> > > >
> > > > 2) Coredump,  nothing print about m.kflags and fixup_type,
>
> Sorry,I forget to set coredump file size :(
>
> The coredump do trigger the do_machine_check() with same m.kflags and
> fixup_type like cow
>
>
> > > > with above check, add a memory_failure_queue() or memory_failure() seems
> > > > to be needed for memcpy_from_iter(), but it is totally different from
> > > > the COW scenario
> > > >
>
> so the memcpy_from_iter() from coredump is same as cow scenario.

Okay, thank you for confirmation.

>
> > > >
> > > > Another question, other copy_mc_to_kernel() callers, eg,
> > > > nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> > > > should they need a memory_failure_queue(), if so, why not add it into
> > > > do_machine_check() ?
> > >
> >
> > What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> > is designed to identify fixups which allow in kernel #MC recovery,
> > that is, the caller of copy_mc_to_kernel() must know the source
> > is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> > the MCE_SAFE type.
>
> And I think we need the following change for MCE_SAFE copy to set
> MCE_IN_KERNEL_COPYIN.
>
> >
> > diff --git a/arch/x86/kernel/cpu/mce/severity.c
> > b/arch/x86/kernel/cpu/mce/severity.c
> > index c4477162c07d..63e94484c5d6 100644
> > --- a/arch/x86/kernel/cpu/mce/severity.c
> > +++ b/arch/x86/kernel/cpu/mce/severity.c
> > @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
> > struct pt_regs *regs)
> >         case EX_TYPE_COPY:
> >                 if (!copy_user)
> >                         return IN_KERNEL;
> > -               m->kflags |= MCE_IN_KERNEL_COPYIN;

This change seems to not related to what you try to fix.
Could this break some other workloads like copying from user address?

> >                 fallthrough;
> >
> >         case EX_TYPE_FAULT_MCE_SAFE:
> >         case EX_TYPE_DEFAULT_MCE_SAFE:
> > -               m->kflags |= MCE_IN_KERNEL_RECOV;
> > +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
> >                 return IN_KERNEL_RECOV;
> >
> >         default:
> >
> > then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
> > or every Machine Check safe memory copy will need a memory_failure_xx()
> > call.
>
> which help use to kill unneeded memory_failure_queue() call, any comments?

I'm not 100% sure that we can safely use queue_task_work() instead of
memory_failure_queue() (due to the difference between workqueue and task
work, which should be recently discussed in thread [1]). So I prefer to
keep the approach of memory_failure_queue() to keep the impact minimum.

[1] https://lore.kernel.org/lkml/[email protected]/T/#u

Thanks,
Naoya Horiguchi

>
>
> >
> > +Thomas,who add the two types, could you share some comments about
> > this,thanks.
> >
> > > In the dax case, if the source address is poisoned, and we do follow
> > > up with memory_failure_queue(pfn, flags), what should the value of
> > > the 'flags' be ?
> >
>
> With above diff change, we don't add a memory_failure_queue() into dax too.

2023-04-21 05:54:22

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/21 11:13, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Apr 20, 2023 at 11:05:12PM +0800, Kefeng Wang wrote:
>>
>>
>> On 2023/4/20 10:59, Kefeng Wang wrote:
>>>
>>>
>>> On 2023/4/20 10:03, Jane Chu wrote:
>>>>
>>>> On 4/19/2023 5:03 AM, Kefeng Wang wrote:
>>>>>
>>>>>
>>>>> On 2023/4/19 15:25, HORIGUCHI NAOYA(堀口 直也) wrote:
>>>>>> On Tue, Apr 18, 2023 at 05:45:06PM +0800, Kefeng Wang wrote:
>>>>>>>
>>>>>>>
>>> ...
>>>>>>>>> @@ -371,6 +372,14 @@ size_t
>>>>>>>>> _copy_mc_to_iter(const void *addr, size_t bytes,
>>>>>>>>> struct iov_iter *i)
>>>>>>>>>    EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
>>>>>>>>>    #endif /* CONFIG_ARCH_HAS_COPY_MC */
>>>>>>>>> +static void *memcpy_from_iter(struct iov_iter
>>>>>>>>> *i, void *to, const void *from,
>>>>>>>>> +                 size_t size)
>>>>>>>>> +{
>>>>>>>>> +    if (iov_iter_is_copy_mc(i))
>>>>>>>>> +        return (void *)copy_mc_to_kernel(to, from, size);
>>>>>>>>
>>>>>>>> Is it helpful to call memory_failure_queue() if
>>>>>>>> copy_mc_to_kernel() fails
>>>>>>>> due to a memory error?
>>>>>>>
>>>>>>> For dump_user_range(), the task is dying, if copy incomplete size, the
>>>>>>> coredump will fail and task will exit, also memory_failure will
>>>>>>> be called by kill_me_maybe(),
>>>>>>>
>>>>>>>   CPU: 0 PID: 1418 Comm: test Tainted: G   M
>>>>>>> 6.3.0-rc5 #29
>>>>>>>   Call Trace:
>>>>>>>    <TASK>
>>>>>>>    dump_stack_lvl+0x37/0x50
>>>>>>>    memory_failure+0x51/0x970
>>>>>>>    kill_me_maybe+0x5b/0xc0
>>>>>>>    task_work_run+0x5a/0x90
>>>>>>>    exit_to_user_mode_prepare+0x194/0x1a0
>>>>>>>    irqentry_exit_to_user_mode+0x9/0x30
>>>>>>>    noist_exc_machine_check+0x40/0x80
>>>>>>>    asm_exc_machine_check+0x33/0x40
>>>>>>
>>>>>> Is this call trace printed out when copy_mc_to_kernel()
>>>>>> failed by finding
>>>>>> a memory error (or in some testcase using error injection)?
>>>>>
>>>>> I add dump_stack() into memory_failure() to check whether the poisoned
>>>>> memory is called or not, and the call trace shows it do call
>>>>> memory_failure(), but I get confused when do the test.
>>>>>
>>>>>> In my understanding, an MCE should not be triggered when
>>>>>> MC-safe copy tries
>>>>>> to access to a memory error.  So I feel that we might be talking about
>>>>>> different scenarios.
>>>>>>
>>>>>> When I questioned previously, I thought about the following scenario:
>>>>>>
>>>>>>    - a process terminates abnormally for any reason like
>>>>>> segmentation fault,
>>>>>>    - then, kernel tries to create a coredump,
>>>>>>    - during this, the copying routine accesses to corrupted
>>>>>> page to read.
>>>>>>
>>>>> Yes, we tested like your described,
>>>>>
>>>>> 1) inject memory error into a process
>>>>> 2) send a SIGABT/SIGBUS to process to trigger the coredump
>>>>>
>>>>> Without patch, the system panic, and with patch only process exits.
>>>>>
>>>>>> In this case the corrupted page should not be handled by
>>>>>> memory_failure()
>>>>>> yet (because otherwise properly handled hwpoisoned page
>>>>>> should be ignored
>>>>>> by coredump process).  The coredump process would exit with
>>>>>> failure with
>>>>>> your patch, but then, the corrupted page is still left
>>>>>> unhandled and can
>>>>>> be reused, so any other thread can easily access to it again.
>>>>>
>>>>> As shown above, the corrupted page will be handled by
>>>>> memory_failure(), but what I'm wondering,
>>>>> 1) memory_failure() is not always called
>>>>> 2) look at the above call trace, it looks like from asynchronous
>>>>>     interrupt, not from synchronous exception, right?
>>>>>
>>>>>>
>>>>>> You can find a few other places (like __wp_page_copy_user
>>>>>> and ksm_might_need_to_copy)
>>>>>> to call memory_failure_queue() to cope with such unhandled error pages.
>>>>>> So does memcpy_from_iter() do the same?
>>>>>
>>>>> I add some debug print in do_machine_check() on x86:
>>>>>
>>>>> 1) COW,
>>>>>    m.kflags: MCE_IN_KERNEL_RECOV
>>>>>    fixup_type: EX_TYPE_DEFAULT_MCE_SAFE
>>>>>
>>>>>    CPU: 11 PID: 2038 Comm: einj_mem_uc
>>>>>    Call Trace:
>>>>>     <#MC>
>>>>>     dump_stack_lvl+0x37/0x50
>>>>>     do_machine_check+0x7ad/0x840
>>>>>     exc_machine_check+0x5a/0x90
>>>>>     asm_exc_machine_check+0x1e/0x40
>>>>>    RIP: 0010:copy_mc_fragile+0x35/0x62
>>>>>
>>>>>    if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>>>>            if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>>>>>                    mce_panic("Failed kernel mode recovery", &m, msg);
>>>>>    }
>>>>>
>>>>>    if (m.kflags & MCE_IN_KERNEL_COPYIN)
>>>>>            queue_task_work(&m, msg, kill_me_never);
>>>>>
>>>>> There is no memory_failure() called when
>>>>> EX_TYPE_DEFAULT_MCE_SAFE, also EX_TYPE_FAULT_MCE_SAFE too,
>>>>> so we manually add a memory_failure_queue() to handle with
>>>>> the poisoned page.
>>>>>
>>>>> 2) Coredump,  nothing print about m.kflags and fixup_type,
>>
>> Sorry,I forget to set coredump file size :(
>>
>> The coredump do trigger the do_machine_check() with same m.kflags and
>> fixup_type like cow
>>
>>
>>>>> with above check, add a memory_failure_queue() or memory_failure() seems
>>>>> to be needed for memcpy_from_iter(), but it is totally different from
>>>>> the COW scenario
>>>>>
>>
>> so the memcpy_from_iter() from coredump is same as cow scenario.
>
> Okay, thank you for confirmation.
>
>>
>>>>>
>>>>> Another question, other copy_mc_to_kernel() callers, eg,
>>>>> nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
>>>>> should they need a memory_failure_queue(), if so, why not add it into
>>>>> do_machine_check() ?
>>>>
>>>
>>> What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
>>> is designed to identify fixups which allow in kernel #MC recovery,
>>> that is, the caller of copy_mc_to_kernel() must know the source
>>> is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
>>> the MCE_SAFE type.
>>
>> And I think we need the following change for MCE_SAFE copy to set
>> MCE_IN_KERNEL_COPYIN.
>>
>>>
>>> diff --git a/arch/x86/kernel/cpu/mce/severity.c
>>> b/arch/x86/kernel/cpu/mce/severity.c
>>> index c4477162c07d..63e94484c5d6 100644
>>> --- a/arch/x86/kernel/cpu/mce/severity.c
>>> +++ b/arch/x86/kernel/cpu/mce/severity.c
>>> @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
>>> struct pt_regs *regs)
>>>         case EX_TYPE_COPY:
>>>                 if (!copy_user)
>>>                         return IN_KERNEL;
>>> -               m->kflags |= MCE_IN_KERNEL_COPYIN;
>
> This change seems to not related to what you try to fix.
> Could this break some other workloads like copying from user address?
>

Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
break it.


>>>                 fallthrough;
>>>
>>>         case EX_TYPE_FAULT_MCE_SAFE:
>>>         case EX_TYPE_DEFAULT_MCE_SAFE:
>>> -               m->kflags |= MCE_IN_KERNEL_RECOV;
>>> +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
>>>                 return IN_KERNEL_RECOV;
>>>
>>>         default:
>>>
>>> then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
>>> or every Machine Check safe memory copy will need a memory_failure_xx()
>>> call.
>>
>> which help use to kill unneeded memory_failure_queue() call, any comments?
>
> I'm not 100% sure that we can safely use queue_task_work() instead of
> memory_failure_queue() (due to the difference between workqueue and task
> work, which should be recently discussed in thread [1]). So I prefer to
> keep the approach of memory_failure_queue() to keep the impact minimum.
>

+tony for x86 mce

The x86 call queue_task_work() for EX_TYPE_COPY, so EX_TYPE_FAULT_MCE_SAFE
and EX_TYPE_DEFAULT_MCE_SAFE should be similar to EX_TYPE_COPY,
memcpy_mc_xxx return bytes not copied, let the task to decide
what to do next, and call memory_failure(pfn, 0) to isolate
the poisoned page.

1) queue_task_work() will make the memory_failure() called before
return-to-user
2) memory_failure_queue() called in COW will put the work on a specific
cpu(current task is running), and memory_failure() will be called in
the work. see more from commit d302c2398ba2 ("mm, hwpoison: when copy-
on-write hits poison, take page offline"), "It is important, but not
urgent, to mark the source page as h/w poisoned and unmap it from other
tasks."

Both of them just wants to isolate memory, they shouldn't add action,
they set flag=0 for memory_failure(). so preliminarily, there are not
different.



> [1] https://lore.kernel.org/lkml/[email protected]/T/#u
>

The COPY_MC support on arm64 is still under review[1], xueshuai's patch
is only trying to fix the uncorrected si_code of synchronous exceptions
when memory error occurred, so I think it is not involved the COPY_MC.


[1]
https://lore.kernel.org/lkml/[email protected]/


Thanks

> Thanks,
> Naoya Horiguchi
>
>>
>>
>>>
>>> +Thomas,who add the two types, could you share some comments about
>>> this,thanks.
>>>
>>>> In the dax case, if the source address is poisoned, and we do follow
>>>> up with memory_failure_queue(pfn, flags), what should the value of
>>>> the 'flags' be ?
>>>
>>
>> With above diff change, we don't add a memory_failure_queue() into dax too.

Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

On Fri, Apr 21, 2023 at 01:43:39PM +0800, Kefeng Wang wrote:
...
> > > > > >
> > > > > > Another question, other copy_mc_to_kernel() callers, eg,
> > > > > > nvdimm/dm-writecache/dax, there are not call memory_failure_queue(),
> > > > > > should they need a memory_failure_queue(), if so, why not add it into
> > > > > > do_machine_check() ?
> > > > >
> > > >
> > > > What I mean is that EX_TYPE_DEFAULT_MCE_SAFE/EX_TYPE_FAULT_MCE_SAFE
> > > > is designed to identify fixups which allow in kernel #MC recovery,
> > > > that is, the caller of copy_mc_to_kernel() must know the source
> > > > is a user address, so we could add a MCE_IN_KERNEL_COPYIN fro
> > > > the MCE_SAFE type.
> > >
> > > And I think we need the following change for MCE_SAFE copy to set
> > > MCE_IN_KERNEL_COPYIN.
> > >
> > > >
> > > > diff --git a/arch/x86/kernel/cpu/mce/severity.c
> > > > b/arch/x86/kernel/cpu/mce/severity.c
> > > > index c4477162c07d..63e94484c5d6 100644
> > > > --- a/arch/x86/kernel/cpu/mce/severity.c
> > > > +++ b/arch/x86/kernel/cpu/mce/severity.c
> > > > @@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
> > > > struct pt_regs *regs)
> > > >         case EX_TYPE_COPY:
> > > >                 if (!copy_user)
> > > >                         return IN_KERNEL;
> > > > -               m->kflags |= MCE_IN_KERNEL_COPYIN;
> >
> > This change seems to not related to what you try to fix.
> > Could this break some other workloads like copying from user address?
> >
>
> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
> break it.
>
>
> > > >                 fallthrough;

Sorry, I overlooked this fallthrough. So this change is fine to me.

> > > >
> > > >         case EX_TYPE_FAULT_MCE_SAFE:
> > > >         case EX_TYPE_DEFAULT_MCE_SAFE:
> > > > -               m->kflags |= MCE_IN_KERNEL_RECOV;
> > > > +               m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
> > > >                 return IN_KERNEL_RECOV;
> > > >
> > > >         default:
> > > >
> > > > then we could drop memory_failure_queue(pfn, flags) from cow/ksm copy,
> > > > or every Machine Check safe memory copy will need a memory_failure_xx()
> > > > call.
> > >
> > > which help use to kill unneeded memory_failure_queue() call, any comments?
> >
> > I'm not 100% sure that we can safely use queue_task_work() instead of
> > memory_failure_queue() (due to the difference between workqueue and task
> > work, which should be recently discussed in thread [1]). So I prefer to
> > keep the approach of memory_failure_queue() to keep the impact minimum.
> >
>
> +tony for x86 mce
>
> The x86 call queue_task_work() for EX_TYPE_COPY, so EX_TYPE_FAULT_MCE_SAFE
> and EX_TYPE_DEFAULT_MCE_SAFE should be similar to EX_TYPE_COPY,
> memcpy_mc_xxx return bytes not copied, let the task to decide
> what to do next, and call memory_failure(pfn, 0) to isolate
> the poisoned page.
>
> 1) queue_task_work() will make the memory_failure() called before
> return-to-user
> 2) memory_failure_queue() called in COW will put the work on a specific
> cpu(current task is running), and memory_failure() will be called in
> the work. see more from commit d302c2398ba2 ("mm, hwpoison: when copy-
> on-write hits poison, take page offline"), "It is important, but not
> urgent, to mark the source page as h/w poisoned and unmap it from other
> tasks."
>
> Both of them just wants to isolate memory, they shouldn't add action,
> they set flag=0 for memory_failure(). so preliminarily, there are not
> different.

Thanks, sounds good to me.

- Naoya Horiguchi

>
>
>
> > [1] https://lore.kernel.org/lkml/[email protected]/T/#u
> >
>
> The COPY_MC support on arm64 is still under review[1], xueshuai's patch
> is only trying to fix the uncorrected si_code of synchronous exceptions
> when memory error occurred, so I think it is not involved the COPY_MC.

2023-04-24 16:25:30

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

> > This change seems to not related to what you try to fix.
> > Could this break some other workloads like copying from user address?
> >
>
> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
> break it.

Should Linux even try to take a core dump for a SIGBUS generated because
the application accessed a poisoned page?

It doesn't seem like it would be useful. Core dumps are for debugging s/w
program errors in applications and libraries. That isn't the case when there
is a poison consumption. The application did nothing wrong.

This patch is still useful though. There may be an undiscovered poison
page in the application. Avoiding a kernel crash when dumping core
is still a good thing.

-Tony

2023-04-25 01:56:23

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/25 0:17, Luck, Tony wrote:
>>> This change seems to not related to what you try to fix.
>>> Could this break some other workloads like copying from user address?
>>>
>>
>> Yes, this move MCE_IN_KERNEL_COPYIN set into next case, both COPY and
>> MCE_SAFE type will set MCE_IN_KERNEL_COPYIN, for EX_TYPE_COPY, we don't
>> break it.
>
> Should Linux even try to take a core dump for a SIGBUS generated because
> the application accessed a poisoned page?
>
> It doesn't seem like it would be useful. Core dumps are for debugging s/w
> program errors in applications and libraries. That isn't the case when there
> is a poison consumption. The application did nothing wrong.
>
> This patch is still useful though. There may be an undiscovered poison
> page in the application. Avoiding a kernel crash when dumping core
> is still a good thing.

Thanks for your confirm, and what your option about add
MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
which kill every call memory_failure_queue() after mc safe copy return?

>
> -Tony

2023-04-25 17:22:26

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

> Thanks for your confirm, and what your option about add
> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> which kill every call memory_failure_queue() after mc safe copy return?

I haven't been following this thread closely. Can you give a link to the e-mail
where you posted a patch that does this? Or just repost that patch if easier.

-Tony

2023-04-26 01:25:50

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/26 1:16, Luck, Tony wrote:
>> Thanks for your confirm, and what your option about add
>> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
>> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
>> which kill every call memory_failure_queue() after mc safe copy return?
>
> I haven't been following this thread closely. Can you give a link to the e-mail
> where you posted a patch that does this? Or just repost that patch if easier.

The major diff changes is [1], I will post a formal patch when -rc1 out,
thanks.

[1]
https://lore.kernel.org/linux-mm/[email protected]/
>
> -Tony

2023-04-26 15:59:08

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

> >> Thanks for your confirm, and what your option about add
> >> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> >> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> >> which kill every call memory_failure_queue() after mc safe copy return?
> >
> > I haven't been following this thread closely. Can you give a link to the e-mail
> > where you posted a patch that does this? Or just repost that patch if easier.
>
> The major diff changes is [1], I will post a formal patch when -rc1 out,
> thanks.
>
> [1]
> https://lore.kernel.org/linux-mm/[email protected]/

There seem to be a few misconceptions in that message. Not sure if all of them
were resolved. Here are some pertinent points:

>>> In my understanding, an MCE should not be triggered when MC-safe copy
>>> tries
>>> to access to a memory error. So I feel that we might be talking about
>>> different scenarios.

This is wrong. There is still a machine check when a MC-safe copy does a read
from a location that has a memory error.

The recovery flow in this case does not involve queue_task_work(). That is only
useful for machine check exceptions taken in user context. The queued work will
be executed to call memory_failure() from the kernel, but in process context (not
from the machine check exception stack) to handle the error.

For machine checks taken by kernel code (MC-safe copy functions) the recovery
path is here:

if (m.kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
}

if (m.kflags & MCE_IN_KERNEL_COPYIN)
queue_task_work(&m, msg, kill_me_never);

The "fixup_exception()" ensures that on return from the machine check handler
code returns to the extable[] fixup location instead of the instruction that was
loading from the memory error location.

When the exception was from one of the copy_from_user() variants it makes
sense to also do the queue_task_work() because the kernel is going to return
to the user context (with an EFAULT error code from whatever system call was
attempting the copy_from_user()).

But in the core dump case there is no return to user. The process is being
terminated by the signal that leads to this core dump. So even though you
may consider the page being accessed to be a "user" page, you can't fix
it by queueing work to run on return to user.

I don't have an well thought out suggestion on how to make sure that memory_failure()
is called for the page in this case. Maybe the core dump code can check for the
return from the MC-safe copy it is using and handle it in the error path?

-Tony

2023-04-27 01:16:30

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/26 23:45, Luck, Tony wrote:
>>>> Thanks for your confirm, and what your option about add
>>>> MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
>>>> to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
>>>> which kill every call memory_failure_queue() after mc safe copy return?
>>>
>>> I haven't been following this thread closely. Can you give a link to the e-mail
>>> where you posted a patch that does this? Or just repost that patch if easier.
>>
>> The major diff changes is [1], I will post a formal patch when -rc1 out,
>> thanks.
>>
>> [1]
>> https://lore.kernel.org/linux-mm/[email protected]/
>
> There seem to be a few misconceptions in that message. Not sure if all of them
> were resolved. Here are some pertinent points:
>
>>>> In my understanding, an MCE should not be triggered when MC-safe copy
>>>> tries
>>>> to access to a memory error. So I feel that we might be talking about
>>>> different scenarios.
>
> This is wrong. There is still a machine check when a MC-safe copy does a read
> from a location that has a memory error.
>
> The recovery flow in this case does not involve queue_task_work(). That is only
> useful for machine check exceptions taken in user context. The queued work will
> be executed to call memory_failure() from the kernel, but in process context (not
> from the machine check exception stack) to handle the error.
>
> For machine checks taken by kernel code (MC-safe copy functions) the recovery
> path is here:
>
> if (m.kflags & MCE_IN_KERNEL_RECOV) {
> if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> mce_panic("Failed kernel mode recovery", &m, msg);
> }
>
> if (m.kflags & MCE_IN_KERNEL_COPYIN)
> queue_task_work(&m, msg, kill_me_never);
>
> The "fixup_exception()" ensures that on return from the machine check handler
> code returns to the extable[] fixup location instead of the instruction that was
> loading from the memory error location.
>
> When the exception was from one of the copy_from_user() variants it makes
> sense to also do the queue_task_work() because the kernel is going to return
> to the user context (with an EFAULT error code from whatever system call was
> attempting the copy_from_user()).
>
> But in the core dump case there is no return to user. The process is being
> terminated by the signal that leads to this core dump. So even though you
> may consider the page being accessed to be a "user" page, you can't fix
> it by queueing work to run on return to user.

For coredump,the task work will be called too, see following code,

get_signal
sig_kernel_coredump
elf_core_dump
dump_user_range
_copy_from_iter // with MC-safe copy, return without panic
do_group_exit(ksig->info.si_signo);
do_exit
exit_task_work
task_work_run
kill_me_never
memory_failure

I also add debug print to check the memory_failure() processing after
add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
normal page and huge page, it works too.

>
> I don't have an well thought out suggestion on how to make sure that memory_failure()
> is called for the page in this case. Maybe the core dump code can check for the
> return from the MC-safe copy it is using and handle it in the error path?
>

So we could safely add MCE_IN_KERNEL_COPYIN to all MCE_SAFE exception type?

The kernel diff based on next-20230425 shown bellow

diff --git a/arch/x86/kernel/cpu/mce/severity.c
b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..63e94484c5d6 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -293,12 +293,11 @@ static noinstr int error_context(struct mce *m,
struct pt_regs *regs)
case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
fallthrough;

case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
- m->kflags |= MCE_IN_KERNEL_RECOV;
+ m->kflags |= MCE_IN_KERNEL_RECOV | MCE_IN_KERNEL_COPYIN;
return IN_KERNEL_RECOV;

default:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4509a566fe6c..59a7afe3dfce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3667,23 +3667,19 @@ enum mf_flags {
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
+extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
-extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
struct task_struct *task_early_kill(struct task_struct *tsk, int
force_early);
#else
-static inline void memory_failure_queue(unsigned long pfn, int flags)
-{
-}
-
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int
flags,
bool *migratable_cleared)
{
diff --git a/mm/ksm.c b/mm/ksm.c
index 0156bded3a66..7abdf4892387 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2794,7 +2794,6 @@ struct page *ksm_might_need_to_copy(struct page *page,
if (new_page) {
if (copy_mc_user_highpage(new_page, page, address, vma)) {
put_page(new_page);
- memory_failure_queue(page_to_pfn(page), 0);
return ERR_PTR(-EHWPOISON);
}
SetPageDirty(new_page);
diff --git a/mm/memory.c b/mm/memory.c
index 5e2c6b1fc00e..c0f586257017 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2814,10 +2814,8 @@ static inline int __wp_page_copy_user(struct page
*dst, struct page *src,
unsigned long addr = vmf->address;

if (likely(src)) {
- if (copy_mc_user_highpage(dst, src, addr, vma)) {
- memory_failure_queue(page_to_pfn(src), 0);
+ if (copy_mc_user_highpage(dst, src, addr, vma))
return -EHWPOISON;
- }
return 0;
}

@@ -5852,10 +5850,8 @@ static int copy_user_gigantic_page(struct folio
*dst, struct folio *src,

cond_resched();
if (copy_mc_user_highpage(dst_page, src_page,
- addr + i*PAGE_SIZE, vma)) {
- memory_failure_queue(page_to_pfn(src_page), 0);
+ addr + i*PAGE_SIZE, vma))
return -EHWPOISON;
- }
}
return 0;
}
@@ -5871,10 +5867,8 @@ static int copy_subpage(unsigned long addr, int
idx, void *arg)
struct copy_subpage_arg *copy_arg = arg;

if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
- addr, copy_arg->vma)) {
- memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+ addr, copy_arg->vma))
return -EHWPOISON;
- }
return 0;
}

--
2.35.3


Thanks,

> -Tony

Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

On Thu, Apr 27, 2023 at 09:06:46AM +0800, Kefeng Wang wrote:
>
>
> On 2023/4/26 23:45, Luck, Tony wrote:
> > > > > Thanks for your confirm, and what your option about add
> > > > > MCE_IN_KERNEL_COPYIN to EX_TYPE_DEFAULT_MCE_SAFE/FAULT_MCE_SAFE type
> > > > > to let do_machine_check call queue_task_work(&m, msg, kill_me_never),
> > > > > which kill every call memory_failure_queue() after mc safe copy return?
> > > >
> > > > I haven't been following this thread closely. Can you give a link to the e-mail
> > > > where you posted a patch that does this? Or just repost that patch if easier.
> > >
> > > The major diff changes is [1], I will post a formal patch when -rc1 out,
> > > thanks.
> > >
> > > [1]
> > > https://lore.kernel.org/linux-mm/[email protected]/
> >
> > There seem to be a few misconceptions in that message. Not sure if all of them
> > were resolved. Here are some pertinent points:
> >
> > > > > In my understanding, an MCE should not be triggered when MC-safe copy
> > > > > tries
> > > > > to access to a memory error. So I feel that we might be talking about
> > > > > different scenarios.
> >
> > This is wrong. There is still a machine check when a MC-safe copy does a read
> > from a location that has a memory error.

Yes, the above was my first impression to be proven wrong ;)

> >
> > The recovery flow in this case does not involve queue_task_work(). That is only
> > useful for machine check exceptions taken in user context. The queued work will
> > be executed to call memory_failure() from the kernel, but in process context (not
> > from the machine check exception stack) to handle the error.
> >
> > For machine checks taken by kernel code (MC-safe copy functions) the recovery
> > path is here:
> >
> > if (m.kflags & MCE_IN_KERNEL_RECOV) {
> > if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
> > mce_panic("Failed kernel mode recovery", &m, msg);
> > }
> >
> > if (m.kflags & MCE_IN_KERNEL_COPYIN)
> > queue_task_work(&m, msg, kill_me_never);
> >
> > The "fixup_exception()" ensures that on return from the machine check handler
> > code returns to the extable[] fixup location instead of the instruction that was
> > loading from the memory error location.
> >
> > When the exception was from one of the copy_from_user() variants it makes
> > sense to also do the queue_task_work() because the kernel is going to return
> > to the user context (with an EFAULT error code from whatever system call was
> > attempting the copy_from_user()).
> >
> > But in the core dump case there is no return to user. The process is being
> > terminated by the signal that leads to this core dump. So even though you
> > may consider the page being accessed to be a "user" page, you can't fix
> > it by queueing work to run on return to user.
>
> For coredump,the task work will be called too, see following code,
>
> get_signal
> sig_kernel_coredump
> elf_core_dump
> dump_user_range
> _copy_from_iter // with MC-safe copy, return without panic
> do_group_exit(ksig->info.si_signo);
> do_exit
> exit_task_work
> task_work_run
> kill_me_never
> memory_failure
>
> I also add debug print to check the memory_failure() processing after
> add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
> normal page and huge page, it works too.

Sounds nice to me.
Maybe this information is worth documenting in the patch description.

Thanks,
Naoya Horiguchi

2023-04-27 16:49:43

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()

> > But in the core dump case there is no return to user. The process is being
> > terminated by the signal that leads to this core dump. So even though you
> > may consider the page being accessed to be a "user" page, you can't fix
> > it by queueing work to run on return to user.
>
> For coredump,the task work will be called too, see following code,
>
> get_signal
> sig_kernel_coredump
> elf_core_dump
> dump_user_range
> _copy_from_iter // with MC-safe copy, return without panic
> do_group_exit(ksig->info.si_signo);
> do_exit
> exit_task_work
> task_work_run
> kill_me_never
> memory_failure
>

Nice. I didn't realize that the exit code path would clear any pending task_work() requests.
But it makes sense that this happens. Thanks for filling a gap in my knowledge.

-Tony

2023-04-28 09:00:16

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/27 10:31, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Thu, Apr 27, 2023 at 09:06:46AM +0800, Kefeng Wang wrote:
>>
>>
...
>> For coredump,the task work will be called too, see following code,
>>
>> get_signal
>> sig_kernel_coredump
>> elf_core_dump
>> dump_user_range
>> _copy_from_iter // with MC-safe copy, return without panic
>> do_group_exit(ksig->info.si_signo);
>> do_exit
>> exit_task_work
>> task_work_run
>> kill_me_never
>> memory_failure
>>
>> I also add debug print to check the memory_failure() processing after
>> add MCE_IN_KERNEL_COPYIN to MCE_SAFE exception type, also tested CoW of
>> normal page and huge page, it works too.
>
> Sounds nice to me.
> Maybe this information is worth documenting in the patch description.
>

Sure, will make a formal patch and send out, thanks.

> Thanks,
> Naoya Horiguchi

2023-04-28 09:00:54

by Kefeng Wang

[permalink] [raw]
Subject: Re: [PATCH v2] mm: hwpoison: coredump: support recovery from dump_user_range()



On 2023/4/28 0:45, Luck, Tony wrote:
>>> But in the core dump case there is no return to user. The process is being
>>> terminated by the signal that leads to this core dump. So even though you
>>> may consider the page being accessed to be a "user" page, you can't fix
>>> it by queueing work to run on return to user.
>>
>> For coredump,the task work will be called too, see following code,
>>
>> get_signal
>> sig_kernel_coredump
>> elf_core_dump
>> dump_user_range
>> _copy_from_iter // with MC-safe copy, return without panic
>> do_group_exit(ksig->info.si_signo);
>> do_exit
>> exit_task_work
>> task_work_run
>> kill_me_never
>> memory_failure
>>
>
> Nice. I didn't realize that the exit code path would clear any pending task_work() requests.
> But it makes sense that this happens. Thanks for filling a gap in my knowledge.
>
Yep, we could be benefit from it to unify memory failure handling :)

> -Tony