With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.
But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.
So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.
If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.
Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.
In some platform, when SEA triggerred, physical address could be reported
by memory section or by processor section, so we save address at this two
place.
---
v5 - v4:
- rebased on top of 4.15-rc9 + efi patches
efi patches:
https://patchwork.codeaurora.org/patch/415877/
https://patchwork.codeaurora.org/patch/415879/
- add Tyler & Xiongfeng's Tested-by.
v4 - v3:
- rebase on top of the latest mainline
- make ghes_arm_process_error as a weak function
- only pick cache error from arm processor section for error recovery
- fix s-o-b issue
https://lkml.org/lkml/2017/9/7/98
v3 - v2:
- fix patch style issue
v2 - v1:
- wrap arm_proc_error_check and log_arm_hw_error in a single arm_process_error()
- fix sea_save_info return value issue
- fix link error if this CONFIG_ARM64_ERR_RECOV is not selected
- use a notify chain instead of call arch_apei_report_mem_error directly
https://lkml.org/lkml/2017/9/1/189
Xie XiuQi (3):
arm64/ras: support sea error recovery
GHES: add a notify chain for process memory section
arm64/ras: save error address from memory section for recovery
arch/arm64/Kconfig | 11 +++
arch/arm64/include/asm/ras.h | 23 +++++
arch/arm64/include/asm/thread_info.h | 4 +-
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/ras.c | 173 +++++++++++++++++++++++++++++++++++
arch/arm64/kernel/signal.c | 7 ++
arch/arm64/mm/fault.c | 27 ++++--
drivers/acpi/apei/ghes.c | 18 +++-
include/acpi/ghes.h | 11 +++
9 files changed, 265 insertions(+), 10 deletions(-)
create mode 100644 arch/arm64/include/asm/ras.h
create mode 100644 arch/arm64/kernel/ras.c
--
1.8.3.1
In some platform, when SEA triggerred, physical address might be
reported by memory section, so we save it for error recovery later.
Signed-off-by: Xie XiuQi <[email protected]>
Tested-by: Wang Xiongfeng <[email protected]>
Tested-by: Tyler Baicar <[email protected]>
---
arch/arm64/kernel/ras.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index 05d3871..4690193 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -140,3 +140,34 @@ void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err)
if (info_saved)
set_thread_flag(TIF_SEA_NOTIFY);
}
+
+int ghes_mem_err_callback(struct notifier_block *nb, unsigned long val, void *data)
+{
+ bool info_saved = false;
+ struct ghes_mem_err *ghes_mem = (struct ghes_mem_err *)data;
+ struct cper_sec_mem_err *mem_err = ghes_mem->mem_err;
+
+ if ((ghes_mem->notify_type != ACPI_HEST_NOTIFY_SEA) ||
+ (ghes_mem->severity != CPER_SEV_RECOVERABLE))
+ return 0;
+
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA)
+ info_saved = sea_save_info(mem_err->physical_addr);
+
+ if (info_saved)
+ set_thread_flag(TIF_SEA_NOTIFY);
+
+ return 0;
+}
+
+static struct notifier_block ghes_mem_err_nb = {
+ .notifier_call = ghes_mem_err_callback,
+};
+
+static int arm64_err_recov_init(void)
+{
+ atomic_notifier_chain_register(&ghes_mem_err_chain, &ghes_mem_err_nb);
+ return 0;
+}
+
+late_initcall(arm64_err_recov_init);
--
1.8.3.1
Add a notify chain for process memory section, with
which other modules might do error recovery.
Signed-off-by: Xie XiuQi <[email protected]>
Tested-by: Wang Xiongfeng <[email protected]>
Tested-by: Tyler Baicar <[email protected]>
---
drivers/acpi/apei/ghes.c | 10 ++++++++++
include/acpi/ghes.h | 8 ++++++++
2 files changed, 18 insertions(+)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index cff671d..1f0ebfb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
static LIST_HEAD(ghes_hed);
static DEFINE_MUTEX(ghes_list_mutex);
+ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
+EXPORT_SYMBOL(ghes_mem_err_chain);
+
/*
* Because the memory area used to transfer hardware error information
* from BIOS to Linux can be determined only in NMI, IRQ or timer
@@ -441,6 +444,13 @@ static void ghes_do_proc(struct ghes *ghes,
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+ struct ghes_mem_err mem;
+
+ mem.notify_type = ghes->generic->notify.type;
+ mem.severity = gdata->error_severity;
+ mem.mem_err = mem_err;
+
+ atomic_notifier_call_chain(&ghes_mem_err_chain, 0, &mem);
ghes_edac_report_mem_error(ghes, sev, mem_err);
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index d626367..921ebf1 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -124,4 +124,12 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
extern void ghes_arm_process_error(struct ghes *ghes,
struct cper_sec_proc_arm *err);
+struct ghes_mem_err {
+ int notify_type;
+ int severity;
+ struct cper_sec_mem_err *mem_err;
+};
+
+extern struct atomic_notifier_head ghes_mem_err_chain;
+
#endif /* GHES_H */
--
1.8.3.1
With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
are consumed. According to the existing process, errors occurred in the
kernel, leading to direct panic, if it occurred the user-space, we should
just kill process.
But there is a class of error, in fact, is not necessary to kill
process, you can recover and continue to run the process. Such as
the instruction data corrupted, where the memory page might be
read-only, which is has not been modified, the disk might have the
correct data, so you can directly drop the page, ant reload it when
necessary.
So this patchset is just try to solve such problem: if the error is
consumed in user-space and the error occurs on a clean page, you can
directly drop the memory page without killing process.
If the corrupted page is clean, just dropped it and return to user-space
without side effects. And if corrupted page is dirty, memory_failure()
will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
do_sea() will just send SIGBUS, so the process was killed in the same place.
Because memory_failure() may sleep, we can not call it directly in SEA
exception context. So we saved faulting physical address associated with
a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
from SEA exception context and get into do_notify_resume() before the
process running, we could check it and call memory_failure() to do
recovery. It's safe, because we are in process context.
Signed-off-by: Xie XiuQi <[email protected]>
Tested-by: Wang Xiongfeng <[email protected]>
Tested-by: Tyler Baicar <[email protected]>
Cc: James Morse <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Julien Thierry <[email protected]>
Cc: Hanjun Guo <[email protected]>
---
arch/arm64/Kconfig | 11 +++
arch/arm64/include/asm/ras.h | 23 ++++++
arch/arm64/include/asm/thread_info.h | 4 +-
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/ras.c | 142 +++++++++++++++++++++++++++++++++++
arch/arm64/kernel/signal.c | 7 ++
arch/arm64/mm/fault.c | 27 +++++--
drivers/acpi/apei/ghes.c | 8 +-
include/acpi/ghes.h | 3 +
9 files changed, 216 insertions(+), 10 deletions(-)
create mode 100644 arch/arm64/include/asm/ras.h
create mode 100644 arch/arm64/kernel/ras.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c9a7e9e..8892923 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -688,6 +688,17 @@ config HOTPLUG_CPU
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
+config ARM64_ERR_RECOV
+ bool "Support arm64 RAS error recovery"
+ depends on ACPI_APEI_SEA && MEMORY_FAILURE
+ help
+ With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
+ are consumed. In some cases, if the error address is in a clean page or a
+ read-only page, there is a chance to recover. Such as error occurs in a
+ instruction page, we can reread this page from disk instead of killing process.
+
+ Say Y if unsure.
+
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
new file mode 100644
index 0000000..f0f18da
--- /dev/null
+++ b/arch/arm64/include/asm/ras.h
@@ -0,0 +1,23 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ * Author: Xie XiuQi <[email protected]>
+ * Author: Wang Xiongfeng <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _ASM_RAS_H
+#define _ASM_RAS_H
+
+extern void sea_notify_process(void);
+
+#endif /*_ASM_RAS_H*/
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index eb43128..250d769 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -84,6 +84,7 @@ struct thread_info {
#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */
#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */
+#define TIF_SEA_NOTIFY 6 /* notify to do an error recovery */
#define TIF_NOHZ 7
#define TIF_SYSCALL_TRACE 8
#define TIF_SYSCALL_AUDIT 9
@@ -102,6 +103,7 @@ struct thread_info {
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
#define _TIF_NOHZ (1 << TIF_NOHZ)
+#define _TIF_SEA_NOTIFY (1 << TIF_SEA_NOTIFY)
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
@@ -113,7 +115,7 @@ struct thread_info {
#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
_TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
- _TIF_UPROBE | _TIF_FSCHECK)
+ _TIF_UPROBE | _TIF_FSCHECK | _TIF_SEA_NOTIFY)
#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 067baac..a87f0a2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -42,6 +42,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o
arm64-obj-$(CONFIG_PCI) += pci.o
arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o
arm64-obj-$(CONFIG_ACPI) += acpi.o
+arm64-obj-$(CONFIG_ARM64_ERR_RECOV) += ras.o
arm64-obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o
arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o
arm64-obj-$(CONFIG_PARAVIRT) += paravirt.o
diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
new file mode 100644
index 0000000..05d3871
--- /dev/null
+++ b/arch/arm64/kernel/ras.c
@@ -0,0 +1,142 @@
+/*
+ * ARM64 SEA error recoery support
+ *
+ * Copyright 2017 Huawei Technologies Co., Ltd.
+ * Author: Xie XiuQi <[email protected]>
+ * Author: Wang Xiongfeng <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/cper.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <linux/acpi.h>
+#include <linux/sched/signal.h>
+#include <linux/ras.h>
+
+#include <acpi/actbl1.h>
+#include <acpi/ghes.h>
+#include <acpi/apei.h>
+
+#include <asm/thread_info.h>
+#include <asm/atomic.h>
+#include <asm/ras.h>
+
+/*
+ * Need to save faulting physical address associated with a process
+ * in the sea ghes handler some place where we can grab it back
+ * later in sea_notify_process()
+ */
+#define SEA_INFO_MAX 16
+
+struct sea_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+} sea_info[SEA_INFO_MAX];
+
+static bool sea_save_info(__u64 addr)
+{
+ struct sea_info *si;
+
+ for (si = sea_info; si < &sea_info[SEA_INFO_MAX]; si++) {
+ if (atomic_cmpxchg(&si->inuse, 0, 1) == 0) {
+ si->t = current;
+ si->paddr = addr;
+ return true;
+ }
+ }
+
+ pr_err("Too many concurrent recoverable errors\n");
+ return false;
+}
+
+static struct sea_info *sea_find_info(void)
+{
+ struct sea_info *si;
+
+ for (si = sea_info; si < &sea_info[SEA_INFO_MAX]; si++)
+ if (atomic_read(&si->inuse) && si->t == current)
+ return si;
+ return NULL;
+}
+
+static void sea_clear_info(struct sea_info *si)
+{
+ atomic_set(&si->inuse, 0);
+}
+
+/*
+ * Called in process context that interrupted by SEA and marked with
+ * TIF_SEA_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
+ */
+void sea_notify_process(void)
+{
+ unsigned long pfn;
+ int fail = 0, flags = MF_ACTION_REQUIRED;
+ struct sea_info *si = sea_find_info();
+
+ if (!si)
+ panic("Lost physical address for consumed uncorrectable error");
+
+ clear_thread_flag(TIF_SEA_NOTIFY);
+ do {
+ pfn = si->paddr >> PAGE_SHIFT;
+
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx\n",
+ si->paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (memory_failure(pfn, 0, flags) < 0)
+ fail++;
+
+ sea_clear_info(si);
+
+ si = sea_find_info();
+ } while (si);
+
+ if (fail) {
+ pr_err("Memory error not recovered\n");
+ force_sig(SIGBUS, current);
+ }
+}
+
+void ghes_arm_process_error(struct ghes *ghes, struct cper_sec_proc_arm *err)
+{
+ int i;
+ bool info_saved = false;
+ struct cper_arm_err_info *err_info;
+
+ log_arm_hw_error(err);
+
+ if ((ghes->generic->notify.type != ACPI_HEST_NOTIFY_SEA) ||
+ (ghes->estatus->error_severity != CPER_SEV_RECOVERABLE))
+ return;
+
+ err_info = (struct cper_arm_err_info *)(err + 1);
+ for (i = 0; i < err->err_info_num; i++, err_info++) {
+ if ((err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR) &&
+ (err_info->type == CPER_ARM_CACHE_ERROR))
+ info_saved |= sea_save_info(err_info->physical_fault_addr);
+ }
+
+ if (info_saved)
+ set_thread_flag(TIF_SEA_NOTIFY);
+}
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index b120111..1fe60dd 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -41,6 +41,7 @@
#include <asm/ptrace.h>
#include <asm/signal32.h>
#include <asm/vdso.h>
+#include <asm/ras.h>
/*
* Do a signal return; undo the signal stack. These are aligned to 128-bit.
@@ -907,6 +908,12 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
*/
trace_hardirqs_off();
+#ifdef CONFIG_ARM64_ERR_RECOV
+ /* notify userspace of pending SEAs */
+ if (unlikely(thread_flags & _TIF_SEA_NOTIFY))
+ sea_notify_process();
+#endif /* CONFIG_ARM64_ERR_RECOV */
+
do {
/* Check valid user FS if needed */
addr_limit_user_check();
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9b7f89d..15597e1 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -594,14 +594,25 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
nmi_exit();
}
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = 0;
- if (esr & ESR_ELx_FnV)
- info.si_addr = NULL;
- else
- info.si_addr = (void __user *)addr;
- arm64_notify_die("", regs, &info, esr);
+ if (user_mode(regs)) {
+ if (test_thread_flag(TIF_SEA_NOTIFY))
+ return 0;
+
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = 0;
+ if (esr & ESR_ELx_FnV)
+ info.si_addr = NULL;
+ else
+ info.si_addr = (void __user *)addr;
+
+ current->thread.fault_address = 0;
+ current->thread.fault_code = esr;
+ force_sig_info(info.si_signo, &info, current);
+ } else {
+ die("Uncorrected hardware memory error in kernel-access\n",
+ regs, esr);
+ }
return 0;
}
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 6402f7f..cff671d 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -414,6 +414,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
#endif
}
+void __weak ghes_arm_process_error(struct ghes *ghes,
+ struct cper_sec_proc_arm *err)
+{
+ log_arm_hw_error(err);
+}
+
static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
@@ -476,7 +482,7 @@ static void ghes_do_proc(struct ghes *ghes,
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
- log_arm_hw_error(err);
+ ghes_arm_process_error(ghes, err);
} else {
void *err = acpi_hest_get_payload(gdata);
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8feb0c8..d626367 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -121,4 +121,7 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata)
int ghes_notify_sea(void);
+extern void ghes_arm_process_error(struct ghes *ghes,
+ struct cper_sec_proc_arm *err);
+
#endif /* GHES_H */
--
1.8.3.1
Hi Xie XiuQi,
On 26/01/18 12:31, Xie XiuQi wrote:
> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
> are consumed. According to the existing process, errors occurred in the
> kernel, leading to direct panic, if it occurred the user-space, we should
> just kill process.
>
> But there is a class of error, in fact, is not necessary to kill
> process, you can recover and continue to run the process. Such as
> the instruction data corrupted, where the memory page might be
> read-only, which is has not been modified, the disk might have the
> correct data, so you can directly drop the page, ant reload it when
> necessary.
With firmware-first support, we do all this...
> So this patchset is just try to solve such problem: if the error is
> consumed in user-space and the error occurs on a clean page, you can
> directly drop the memory page without killing process.
>
> If the corrupted page is clean, just dropped it and return to user-space
> without side effects. And if corrupted page is dirty, memory_failure()
> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
> do_sea() will just send SIGBUS, so the process was killed in the same place.
... but this happens too. I agree its something we should fix, but I don't think
this is the best way to do it.
This series is pulling the memory-failure-queue details back into the arch-code
to build a second list, that gets processed as extra work when we return to
user-space.
The root of the issue is ghes_notify_sea() claims the notification as something
APEI has dealt with, ... but it hasn't done it yet. The signals will be
generated by something currently stuck in a queue. (Evidently x86 doesn't handle
synchronous errors like this using firmware-first).
I think a smaller fix is to give the queues that may be holding the
memory_failure() work a kick as part of the code that calls ghes_notify_sea().
This means that by the time we return to do_sea() ghes_notify_sea()'s claim that
APEI has dealt with it is true as any generated signals are pending. We can then
skip the existing SIGBUS generation code.
> Because memory_failure() may sleep, we can not call it directly in SEA
(this one is more serious, I've attempted to fix it by moving all NMI-like
GHES-notifications to use the estatus queue).
> exception context. So we saved faulting physical address associated with
> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
> from SEA exception context and get into do_notify_resume() before the
> process running, we could check it and call memory_failure() to do
> recovery.
> It's safe, because we are in process context.
I think this is the trick. When we take a Synchronous-external-abort out of
userspace, we're in process context too. We can add helpers to drain the
memory_failure_queue which can be called when do_sea() when we know we're
preemptible and interrupts-et-al are unmasked.
Thanks,
James
[0] https://www.spinics.net/lists/linux-acpi/msg80149.html
> ---
> arch/arm64/Kconfig | 11 +++
> arch/arm64/include/asm/ras.h | 23 ++++++
> arch/arm64/include/asm/thread_info.h | 4 +-
> arch/arm64/kernel/Makefile | 1 +
> arch/arm64/kernel/ras.c | 142 +++++++++++++++++++++++++++++++++++
> arch/arm64/kernel/signal.c | 7 ++
> arch/arm64/mm/fault.c | 27 +++++--
> drivers/acpi/apei/ghes.c | 8 +-
> include/acpi/ghes.h | 3 +
> 9 files changed, 216 insertions(+), 10 deletions(-)
> create mode 100644 arch/arm64/include/asm/ras.h
> create mode 100644 arch/arm64/kernel/ras.c
On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote:
> Add a notify chain for process memory section, with
> which other modules might do error recovery.
>
> Signed-off-by: Xie XiuQi <[email protected]>
> Tested-by: Wang Xiongfeng <[email protected]>
> Tested-by: Tyler Baicar <[email protected]>
> ---
> drivers/acpi/apei/ghes.c | 10 ++++++++++
> include/acpi/ghes.h | 8 ++++++++
> 2 files changed, 18 insertions(+)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index cff671d..1f0ebfb 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
> static LIST_HEAD(ghes_hed);
> static DEFINE_MUTEX(ghes_list_mutex);
>
> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
> +EXPORT_SYMBOL(ghes_mem_err_chain);
Definitely not EXPORT_SYMBOL.
And certainly not export the notifier head. Have register and unregister
functions instead.
That is, *if* you can't solve it differently with James. Notifiers
should be avoided if possible.
--
Regards/Gruss,
Boris.
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--
Hi Xie XiuQi,
On 30/01/18 19:19, James Morse wrote:
> On 26/01/18 12:31, Xie XiuQi wrote:
>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>> are consumed. According to the existing process, errors occurred in the
>> kernel, leading to direct panic, if it occurred the user-space, we should
>> just kill process.
>>
>> But there is a class of error, in fact, is not necessary to kill
>> process, you can recover and continue to run the process. Such as
>> the instruction data corrupted, where the memory page might be
>> read-only, which is has not been modified, the disk might have the
>> correct data, so you can directly drop the page, ant reload it when
>> necessary.
>
> With firmware-first support, we do all this...
>
>
>> So this patchset is just try to solve such problem: if the error is
>> consumed in user-space and the error occurs on a clean page, you can
>> directly drop the memory page without killing process.
>>
>> If the corrupted page is clean, just dropped it and return to user-space
>> without side effects. And if corrupted page is dirty, memory_failure()
>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
>> do_sea() will just send SIGBUS, so the process was killed in the same place.
>
> ... but this happens too. I agree its something we should fix, but I don't think
> this is the best way to do it.
>
> This series is pulling the memory-failure-queue details back into the arch-code
> to build a second list, that gets processed as extra work when we return to
> user-space.
>
>
> The root of the issue is ghes_notify_sea() claims the notification as something
> APEI has dealt with, ... but it hasn't done it yet. The signals will be
> generated by something currently stuck in a queue. (Evidently x86 doesn't handle
> synchronous errors like this using firmware-first).
>
> I think a smaller fix is to give the queues that may be holding the
> memory_failure() work a kick as part of the code that calls ghes_notify_sea().
> This means that by the time we return to do_sea() ghes_notify_sea()'s claim that
> APEI has dealt with it is true as any generated signals are pending. We can then
> skip the existing SIGBUS generation code.
>
>
>> Because memory_failure() may sleep, we can not call it directly in SEA
>
> (this one is more serious, I've attempted to fix it by moving all NMI-like
> GHES-notifications to use the estatus queue).
>
>
>> exception context. So we saved faulting physical address associated with
>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
>> from SEA exception context and get into do_notify_resume() before the
>> process running, we could check it and call memory_failure() to do
>> recovery.
>
>> It's safe, because we are in process context.
>
> I think this is the trick. When we take a Synchronous-external-abort out of
> userspace, we're in process context too. We can add helpers to drain the
> memory_failure_queue which can be called when do_sea() when we know we're
> preemptible and interrupts-et-al are unmasked.
Something like... base on [0], in arch/arm64/kernel/acpi.c:
-----------------%<-----------------
int apei_claim_sea(struct pt_regs *regs)
{
int cpu;
int err = -ENOENT;
unsigned long current_flags = arch_local_save_flags();
unsigned long interrupted_flags = current_flags;
if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA))
return err;
if (regs)
interrupted_flags = regs->pstate;
/*
* APEI expects an NMI-like notification to always be called
* in NMI context.
*/
local_daif_restore(DAIF_ERRCTX);
nmi_enter();
err = ghes_notify_sea();
cpu = smp_processor_id();
nmi_exit();
/*
* APEI NMI-like notifications are deferred to irq_work. Unless
* we interrupted irqs-masked code, we can do that now.
*/
if (!err) {
if (!arch_irqs_disabled_flags(interrupted_flags)) {
local_daif_restore(DAIF_PROCCTX_NOIRQ);
irq_work_run();
} else {
err = -EINPROGRESS;
}
}
local_daif_restore(current_flags);
if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE) && !err) {
/*
* Memory failure work is scheduled on the local CPU.
* If we interrupted userspace, or are in process context
* we can do that now.
*/
if ((regs && !user_mode(regs)) || !preemptible())
err = -EINPROGRESS;
else
memory_failure_queue_kick(cpu);
}
return err;
}
-----------------%<-----------------
and to mm/memory-failure.c:
-----------------%<-----------------
@@ -1355,7 +1355,7 @@ static void memory_failure_work_func(struct work_struct *w
ork)
unsigned long proc_flags;
int gotten;
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1369,6 +1369,22 @@ static void memory_failure_work_func(struct work_struct *
work)
}
}
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+ unsigned long flags;
+ struct memory_failure_cpu *mf_cpu;
+
+ might_sleep();
+
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ cancel_work_sync(&mf_cpu->work);
+ memory_failure_work_func(&mf_cpu->work);
+}
+
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
-----------------%<-----------------
I've cooked up some NOTFIY_SEA-ing APEI firmware using kvmtool to test this. I
haven't yet managed to hit irq-masked code with NOTIFY_SEA. I'll try and tidy
this up and post a branch to make it easier to test...
I prefer this as it doesn't duplicate the state then come back on a TIF flag.
I'd like to move the kicking logic into ghes.c, as that is where the queueing
happened, but the 'do-this, restore these flags, do-that' is somewhat tasteless,
and it looks like on arm64 has synchronous nmi-like notifications that must be
handled before returning to user-space...
Thanks,
James
[0] https://www.spinics.net/lists/linux-acpi/msg80149.html
Hi James,
Sorry for reply late.
On 2018/2/8 3:03, James Morse wrote:
> Hi Xie XiuQi,
>
> On 30/01/18 19:19, James Morse wrote:
>> On 26/01/18 12:31, Xie XiuQi wrote:
>>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>>> are consumed. According to the existing process, errors occurred in the
>>> kernel, leading to direct panic, if it occurred the user-space, we should
>>> just kill process.
>>>
>>> But there is a class of error, in fact, is not necessary to kill
>>> process, you can recover and continue to run the process. Such as
>>> the instruction data corrupted, where the memory page might be
>>> read-only, which is has not been modified, the disk might have the
>>> correct data, so you can directly drop the page, ant reload it when
>>> necessary.
>>
>> With firmware-first support, we do all this...
>>
>>
>>> So this patchset is just try to solve such problem: if the error is
>>> consumed in user-space and the error occurs on a clean page, you can
>>> directly drop the memory page without killing process.
>>>
>>> If the corrupted page is clean, just dropped it and return to user-space
>>> without side effects. And if corrupted page is dirty, memory_failure()
>>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
>>> do_sea() will just send SIGBUS, so the process was killed in the same place.
>>
>> ... but this happens too. I agree its something we should fix, but I don't think
>> this is the best way to do it.
>>
>> This series is pulling the memory-failure-queue details back into the arch-code
>> to build a second list, that gets processed as extra work when we return to
>> user-space.
>>
>>
>> The root of the issue is ghes_notify_sea() claims the notification as something
>> APEI has dealt with, ... but it hasn't done it yet. The signals will be
>> generated by something currently stuck in a queue. (Evidently x86 doesn't handle
>> synchronous errors like this using firmware-first).
>>
>> I think a smaller fix is to give the queues that may be holding the
>> memory_failure() work a kick as part of the code that calls ghes_notify_sea().
>> This means that by the time we return to do_sea() ghes_notify_sea()'s claim that
>> APEI has dealt with it is true as any generated signals are pending. We can then
>> skip the existing SIGBUS generation code.
>>
>>
>>> Because memory_failure() may sleep, we can not call it directly in SEA
>>
>> (this one is more serious, I've attempted to fix it by moving all NMI-like
>> GHES-notifications to use the estatus queue).
>>
>>
>>> exception context. So we saved faulting physical address associated with
>>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
>>> from SEA exception context and get into do_notify_resume() before the
>>> process running, we could check it and call memory_failure() to do
>>> recovery.
>>
>>> It's safe, because we are in process context.
>>
>> I think this is the trick. When we take a Synchronous-external-abort out of
>> userspace, we're in process context too. We can add helpers to drain the
>> memory_failure_queue which can be called when do_sea() when we know we're
>> preemptible and interrupts-et-al are unmasked.
>
> Something like... base on [0], in arch/arm64/kernel/acpi.c:
I am very glad that you are trying to solve the problem, which is very helpful.
I agree with your proposal, and I'll test it on by box latter.
Indeed, we're in precess context when we are in sea handler. I was thought we
can't call schedule() in the exception handler before.
Thank you very much!
> -----------------%<-----------------
> int apei_claim_sea(struct pt_regs *regs)
> {
> int cpu;
> int err = -ENOENT;
> unsigned long current_flags = arch_local_save_flags();
> unsigned long interrupted_flags = current_flags;
>
> if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA))
> return err;
>
> if (regs)
> interrupted_flags = regs->pstate;
>
> /*
> * APEI expects an NMI-like notification to always be called
> * in NMI context.
> */
> local_daif_restore(DAIF_ERRCTX);
> nmi_enter();
> err = ghes_notify_sea();
> cpu = smp_processor_id();
> nmi_exit();
>
> /*
> * APEI NMI-like notifications are deferred to irq_work. Unless
> * we interrupted irqs-masked code, we can do that now.
> */
> if (!err) {
> if (!arch_irqs_disabled_flags(interrupted_flags)) {
> local_daif_restore(DAIF_PROCCTX_NOIRQ);
> irq_work_run();
> } else {
> err = -EINPROGRESS;
> }
> }
>
> local_daif_restore(current_flags);
>
> if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE) && !err) {
> /*
> * Memory failure work is scheduled on the local CPU.
> * If we interrupted userspace, or are in process context
> * we can do that now.
> */
> if ((regs && !user_mode(regs)) || !preemptible())
> err = -EINPROGRESS;
> else
> memory_failure_queue_kick(cpu);
> }
>
> return err;
> }
> -----------------%<-----------------
>
>
> and to mm/memory-failure.c:
> -----------------%<-----------------
> @@ -1355,7 +1355,7 @@ static void memory_failure_work_func(struct work_struct *w
> ork)
> unsigned long proc_flags;
> int gotten;
>
> - mf_cpu = this_cpu_ptr(&memory_failure_cpu);
> + mf_cpu = container_of(work, struct memory_failure_cpu, work);
> for (;;) {
> spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> gotten = kfifo_get(&mf_cpu->fifo, &entry);
>
> @@ -1369,6 +1369,22 @@ static void memory_failure_work_func(struct work_struct *
> work)
> }
> }
>
> +/*
> + * Process memory_failure work queued on the specified CPU.
> + * Used to avoid return-to-userspace racing with the memory_failure workqueue.
> + */
> +void memory_failure_queue_kick(int cpu)
> +{
> + unsigned long flags;
> + struct memory_failure_cpu *mf_cpu;
> +
> + might_sleep();
> +
> + mf_cpu = &per_cpu(memory_failure_cpu, cpu);
> + cancel_work_sync(&mf_cpu->work);
> + memory_failure_work_func(&mf_cpu->work);
> +}
> +
> static int __init memory_failure_init(void)
> {
> struct memory_failure_cpu *mf_cpu;
> -----------------%<-----------------
>
> I've cooked up some NOTFIY_SEA-ing APEI firmware using kvmtool to test this. I
> haven't yet managed to hit irq-masked code with NOTIFY_SEA. I'll try and tidy
> this up and post a branch to make it easier to test...
>
> I prefer this as it doesn't duplicate the state then come back on a TIF flag.
> I'd like to move the kicking logic into ghes.c, as that is where the queueing
> happened, but the 'do-this, restore these flags, do-that' is somewhat tasteless,
> and it looks like on arm64 has synchronous nmi-like notifications that must be
> handled before returning to user-space...
>
> Thanks,
>
> James
>
> [0] https://www.spinics.net/lists/linux-acpi/msg80149.html
>
--
Thanks,
Xie XiuQi
Hi Boris,
Thanks for your comments.
On 2018/2/7 18:31, Borislav Petkov wrote:
> On Fri, Jan 26, 2018 at 08:31:24PM +0800, Xie XiuQi wrote:
>> Add a notify chain for process memory section, with
>> which other modules might do error recovery.
>>
>> Signed-off-by: Xie XiuQi <[email protected]>
>> Tested-by: Wang Xiongfeng <[email protected]>
>> Tested-by: Tyler Baicar <[email protected]>
>> ---
>> drivers/acpi/apei/ghes.c | 10 ++++++++++
>> include/acpi/ghes.h | 8 ++++++++
>> 2 files changed, 18 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index cff671d..1f0ebfb 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -109,6 +109,9 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
>> static LIST_HEAD(ghes_hed);
>> static DEFINE_MUTEX(ghes_list_mutex);
>>
>> +ATOMIC_NOTIFIER_HEAD(ghes_mem_err_chain);
>> +EXPORT_SYMBOL(ghes_mem_err_chain);
>
> Definitely not EXPORT_SYMBOL.
>
> And certainly not export the notifier head. Have register and unregister
> functions instead.
>
> That is, *if* you can't solve it differently with James. Notifiers
> should be avoided if possible.
>
Yes, I saw the patchset, I agree. I will work with James to solve the problem.
--
Thanks,
Xie XiuQi
On 2018/2/8 3:03, James Morse wrote:
> Hi Xie XiuQi,
>
> On 30/01/18 19:19, James Morse wrote:
>> On 26/01/18 12:31, Xie XiuQi wrote:
>>> With ARM v8.2 RAS Extension, SEA are usually triggered when memory errors
>>> are consumed. According to the existing process, errors occurred in the
>>> kernel, leading to direct panic, if it occurred the user-space, we should
>>> just kill process.
>>>
>>> But there is a class of error, in fact, is not necessary to kill
>>> process, you can recover and continue to run the process. Such as
>>> the instruction data corrupted, where the memory page might be
>>> read-only, which is has not been modified, the disk might have the
>>> correct data, so you can directly drop the page, ant reload it when
>>> necessary.
>>
>> With firmware-first support, we do all this...
>>
>>
>>> So this patchset is just try to solve such problem: if the error is
>>> consumed in user-space and the error occurs on a clean page, you can
>>> directly drop the memory page without killing process.
>>>
>>> If the corrupted page is clean, just dropped it and return to user-space
>>> without side effects. And if corrupted page is dirty, memory_failure()
>>> will send SIGBUS with code=BUS_MCEERR_AR. While without this patchset,
>>> do_sea() will just send SIGBUS, so the process was killed in the same place.
>>
>> ... but this happens too. I agree its something we should fix, but I don't think
>> this is the best way to do it.
>>
>> This series is pulling the memory-failure-queue details back into the arch-code
>> to build a second list, that gets processed as extra work when we return to
>> user-space.
>>
>>
>> The root of the issue is ghes_notify_sea() claims the notification as something
>> APEI has dealt with, ... but it hasn't done it yet. The signals will be
>> generated by something currently stuck in a queue. (Evidently x86 doesn't handle
>> synchronous errors like this using firmware-first).
>>
>> I think a smaller fix is to give the queues that may be holding the
>> memory_failure() work a kick as part of the code that calls ghes_notify_sea().
>> This means that by the time we return to do_sea() ghes_notify_sea()'s claim that
>> APEI has dealt with it is true as any generated signals are pending. We can then
>> skip the existing SIGBUS generation code.
>>
>>
>>> Because memory_failure() may sleep, we can not call it directly in SEA
>>
>> (this one is more serious, I've attempted to fix it by moving all NMI-like
>> GHES-notifications to use the estatus queue).
>>
>>
>>> exception context. So we saved faulting physical address associated with
>>> a process in the ghes handler and set __TIF_SEA_NOTIFY. When we return
>>> from SEA exception context and get into do_notify_resume() before the
>>> process running, we could check it and call memory_failure() to do
>>> recovery.
>>
>>> It's safe, because we are in process context.
>>
>> I think this is the trick. When we take a Synchronous-external-abort out of
>> userspace, we're in process context too. We can add helpers to drain the
>> memory_failure_queue which can be called when do_sea() when we know we're
>> preemptible and interrupts-et-al are unmasked.
>
> Something like... base on [0], in arch/arm64/kernel/acpi.c:
> -----------------%<-----------------
> int apei_claim_sea(struct pt_regs *regs)
> {
> int cpu;
> int err = -ENOENT;
> unsigned long current_flags = arch_local_save_flags();
> unsigned long interrupted_flags = current_flags;
>
> if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA))
> return err;
>
> if (regs)
> interrupted_flags = regs->pstate;
>
> /*
> * APEI expects an NMI-like notification to always be called
> * in NMI context.
> */
> local_daif_restore(DAIF_ERRCTX);
> nmi_enter();
> err = ghes_notify_sea();
> cpu = smp_processor_id();
> nmi_exit();
>
> /*
> * APEI NMI-like notifications are deferred to irq_work. Unless
> * we interrupted irqs-masked code, we can do that now.
> */
> if (!err) {
> if (!arch_irqs_disabled_flags(interrupted_flags)) {
> local_daif_restore(DAIF_PROCCTX_NOIRQ);
> irq_work_run();
> } else {
> err = -EINPROGRESS;
> }
> }
>
> local_daif_restore(current_flags);
>
> if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE) && !err) {
> /*
> * Memory failure work is scheduled on the local CPU.
> * If we interrupted userspace, or are in process context
> * we can do that now.
> */
> if ((regs && !user_mode(regs)) || !preemptible())
> err = -EINPROGRESS;
> else
> memory_failure_queue_kick(cpu);
> }
>
> return err;
> }
> -----------------%<-----------------
>
>
> and to mm/memory-failure.c:
> -----------------%<-----------------
> @@ -1355,7 +1355,7 @@ static void memory_failure_work_func(struct work_struct *w
> ork)
> unsigned long proc_flags;
> int gotten;
>
> - mf_cpu = this_cpu_ptr(&memory_failure_cpu);
> + mf_cpu = container_of(work, struct memory_failure_cpu, work);
> for (;;) {
> spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> gotten = kfifo_get(&mf_cpu->fifo, &entry);
>
> @@ -1369,6 +1369,22 @@ static void memory_failure_work_func(struct work_struct *
> work)
> }
> }
>
> +/*
> + * Process memory_failure work queued on the specified CPU.
> + * Used to avoid return-to-userspace racing with the memory_failure workqueue.
> + */
> +void memory_failure_queue_kick(int cpu)
> +{
> + unsigned long flags;
> + struct memory_failure_cpu *mf_cpu;
> +
> + might_sleep();
> +
> + mf_cpu = &per_cpu(memory_failure_cpu, cpu);
> + cancel_work_sync(&mf_cpu->work);
> + memory_failure_work_func(&mf_cpu->work);
> +}
> +
> static int __init memory_failure_init(void)
> {
> struct memory_failure_cpu *mf_cpu;
> -----------------%<-----------------
It look like the change is reasonable, thanks James's solving.
>
> I've cooked up some NOTFIY_SEA-ing APEI firmware using kvmtool to test this. I
> haven't yet managed to hit irq-masked code with NOTIFY_SEA. I'll try and tidy
> this up and post a branch to make it easier to test...
>
> I prefer this as it doesn't duplicate the state then come back on a TIF flag.
> I'd like to move the kicking logic into ghes.c, as that is where the queueing
> happened, but the 'do-this, restore these flags, do-that' is somewhat tasteless,
> and it looks like on arm64 has synchronous nmi-like notifications that must be
> handled before returning to user-space...
>
>
>
> Thanks,
>
> James
>
> [0] https://www.spinics.net/lists/linux-acpi/msg80149.html
>
>
>
>
> .
>
Hi Xie XiuQi,
On 08/02/18 08:35, Xie XiuQi wrote:
> I am very glad that you are trying to solve the problem, which is very helpful.
> I agree with your proposal, and I'll test it on by box latter.
>
> Indeed, we're in precess context when we are in sea handler. I was thought we
> can't call schedule() in the exception handler before.
While testing this I've come to the conclusion that the
memory_failure_queue_kick() approach I suggested makes arm64 behave slightly
differently with APEI, and would need re-inventing if we support kernel-first
too. The same race exists with memory-failure notifications signalled by SDEI,
and to a lesser extent IRQ. So by fixing this in arch-code, we actually making
our lives harder.
Instead, I have the patch below. This is smaller, and not arch specific. It also
saves the arch code secretly knowing that APEI calls memory_failure_queue().
I will post this as part of that series shortly...
Thanks,
James
---------------%<---------------
[PATCH] mm/memory-failure: increase queued recovery work's priority
arm64 can take an NMI-like error notification when user-space steps in
some corrupt memory. APEI's GHES code will call memory_failure_queue()
to schedule the recovery work. We then return to user-space, possibly
taking the fault again.
Currently the arch code unconditionally signals user-space from this
path, so we don't get stuck in this loop, but the affected process
never benefits from memory_failure()s recovery work. To fix this we
need to know the recovery work will run before we get back to user-space.
Increase the priority of the recovery work by scheduling it on the
system_highpri_wq, then try to bump the current task off this CPU
so that the recover work starts immediately.
Reported-by: Xie XiuQi <[email protected]>
Signed-off-by: James Morse <[email protected]>
CC: Xie XiuQi <[email protected]>
CC: gengdongjiu <[email protected]>
---
mm/memory-failure.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4b80ccee4535..14f44d841e8b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -55,6 +55,7 @@
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
+#include <linux/preempt.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
@@ -1319,6 +1320,7 @@ static DEFINE_PER_CPU(struct memory_failure_cpu,
memory_failure_cpu);
*/
void memory_failure_queue(unsigned long pfn, int flags)
{
+ int cpu = smp_processor_id();
struct memory_failure_cpu *mf_cpu;
unsigned long proc_flags;
struct memory_failure_entry entry = {
@@ -1328,11 +1330,14 @@ void memory_failure_queue(unsigned long pfn, int flags)
mf_cpu = &get_cpu_var(memory_failure_cpu);
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
- if (kfifo_put(&mf_cpu->fifo, entry))
- schedule_work_on(smp_processor_id(), &mf_cpu->work);
- else
+ if (kfifo_put(&mf_cpu->fifo, entry)) {
+ queue_work_on(cpu, system_highpri_wq, &mf_cpu->work);
+ set_tsk_need_resched(current);
+ preempt_set_need_resched();
+ } else {
pr_err("Memory failure: buffer overflow when queuing memory
failure at %#lx\n",
pfn);
+ }
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
}
---------------%<---------------