From: Naoya Horiguchi <[email protected]>
The previous patch solves the infinite MCE loop issue when multiple
MCE events races. The remaining issue is to make sure that all threads
processing Action Required MCEs send to the current processes the
SIGBUS with the proper virtual address and the error size.
This patch suggests to do page table walk to find the error virtual
address. If we find multiple virtual addresses in walking, we now can't
determine which one is correct, so we fall back to sending SIGBUS in
kill_me_maybe() without error info as we do now. This corner case needs
to be solved in the future.
Signed-off-by: Naoya Horiguchi <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 13 ++-
include/linux/swapops.h | 5 ++
mm/memory-failure.c | 147 ++++++++++++++++++++++++++++++++-
3 files changed, 161 insertions(+), 4 deletions(-)
diff --git v5.12-rc5/arch/x86/kernel/cpu/mce/core.c v5.12-rc5_patched/arch/x86/kernel/cpu/mce/core.c
index 7962355436da..3ce23445a48c 100644
--- v5.12-rc5/arch/x86/kernel/cpu/mce/core.c
+++ v5.12-rc5_patched/arch/x86/kernel/cpu/mce/core.c
@@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ int ret;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
- !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
sync_core();
return;
}
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info, so no need to
+ * send it here again.
+ */
+ if (ret == -EHWPOISON)
+ return;
+
if (p->mce_vaddr != (void __user *)-1l) {
force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
} else {
diff --git v5.12-rc5/include/linux/swapops.h v5.12-rc5_patched/include/linux/swapops.h
index d9b7c9132c2f..98ea67fcf360 100644
--- v5.12-rc5/include/linux/swapops.h
+++ v5.12-rc5_patched/include/linux/swapops.h
@@ -323,6 +323,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
+static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
+{
+ return swp_offset(entry);
+}
+
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);
diff --git v5.12-rc5/mm/memory-failure.c v5.12-rc5_patched/mm/memory-failure.c
index 368ef77e01f9..04e002bd573a 100644
--- v5.12-rc5/mm/memory-failure.c
+++ v5.12-rc5_patched/mm/memory-failure.c
@@ -56,6 +56,7 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
+#include <linux/pagewalk.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -554,6 +555,142 @@ static void collect_procs(struct page *page, struct list_head *tokill,
collect_procs_file(page, tokill, force_early);
}
+struct hwp_walk {
+ struct to_kill tk;
+ unsigned long pfn;
+ int flags;
+};
+
+static int set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+ /* Abort pagewalk when finding multiple mappings to the error page. */
+ if (tk->addr)
+ return 1;
+ tk->addr = addr;
+ tk->size_shift = shift;
+ return 0;
+}
+
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+ unsigned long poisoned_pfn, struct to_kill *tk)
+{
+ unsigned long pfn;
+
+ if (pte_present(pte)) {
+ pfn = pte_pfn(pte);
+ } else {
+ swp_entry_t swp = pte_to_swp_entry(pte);
+
+ if (is_hwpoison_entry(swp))
+ pfn = hwpoison_entry_to_pfn(swp);
+ }
+
+ if (!pfn || pfn != poisoned_pfn)
+ return 0;
+
+ return set_to_kill(tk, addr, shift);
+}
+
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ int ret;
+ pte_t *ptep;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+ if (ptl) {
+ pmd_t pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ unsigned long pfn = pmd_pfn(pmd);
+
+ if (pfn <= hwp->pfn && hwp->pfn < pfn + PMD_SIZE) {
+ unsigned long hwpoison_vaddr = addr +
+ (hwp->pfn << PAGE_SHIFT & ~PMD_MASK);
+
+ ret = set_to_kill(&hwp->tk, hwpoison_vaddr,
+ PAGE_SHIFT);
+ }
+ }
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmdp))
+ goto out;
+
+ ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
+ ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+ hwp->pfn, &hwp->tk);
+ if (ret == 1)
+ break;
+ }
+ pte_unmap_unlock(ptep - 1, ptl);
+out:
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ pte_t pte = huge_ptep_get(ptep);
+ struct hstate *h = hstate_vma(walk->vma);
+
+ return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+ hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range NULL
+#endif
+
+static struct mm_walk_ops hwp_walk_ops = {
+ .pmd_entry = hwpoison_pte_range,
+ .hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with the error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple Action Optional MCE events races on different CPUs with
+ * Local MCE enabled.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks page table to find it. One challenge on this is
+ * to reliably get the proper virual address of the error to report to
+ * applications via SIGBUS. A process could map a page multiple times to
+ * different virtual addresses, then we now have no way to tell which virtual
+ * address was accessed when the Action Required MCE was generated.
+ * So in such a corner case, we now give up and fall back to sending SIGBUS
+ * with no error info.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+ int flags)
+{
+ int ret;
+ struct hwp_walk priv = {
+ .pfn = pfn,
+ };
+ priv.tk.tsk = p;
+
+ mmap_read_lock(p->mm);
+ ret = walk_page_range(p->mm, 0, TASK_SIZE_MAX, &hwp_walk_ops,
+ (void *)&priv);
+ if (!ret && priv.tk.addr)
+ kill_proc(&priv.tk, pfn, flags);
+ mmap_read_unlock(p->mm);
+ return ret ? -EFAULT : -EHWPOISON;
+}
+
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
[MF_FAILED] = "Failed",
@@ -1228,7 +1365,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return -EHWPOISON;
+ res = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ res = kill_accessing_process(current, page_to_pfn(head), flags);
+ return res;
}
num_poisoned_pages_inc();
@@ -1437,8 +1577,11 @@ int memory_failure(unsigned long pfn, int flags)
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
+ res = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ res = kill_accessing_process(current, pfn, flags);
mutex_unlock(&mf_mutex);
- return -EHWPOISON;
+ return res;
}
orig_head = hpage = compound_head(p);
--
2.25.1
On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
...
> + * This function is intended to handle "Action Required" MCEs on already
> + * hardware poisoned pages. They could happen, for example, when
> + * memory_failure() failed to unmap the error page at the first call, or
> + * when multiple Action Optional MCE events races on different CPUs with
> + * Local MCE enabled.
+Tony Luck
Hey Tony, I thought SRAO MCEs are broadcasted to all cores in the system
as they come without an execution context, is it correct?
If Yes, Naoya, I think we might want to remove the comments about the
"multiple Action Optional MCE racing" part.
Best,
-Jue
On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
> This patch suggests to do page table walk to find the error virtual
> address. If we find multiple virtual addresses in walking, we now can't
> determine which one is correct, so we fall back to sending SIGBUS in
> kill_me_maybe() without error info as we do now. This corner case needs
> to be solved in the future.
Instead of walking the page tables, I wonder what about the following idea:
When failing to get vaddr, memory_failure just ensures the mapping is removed
and an hwpoisoned swap pte is put in place; or the original page is flagged with
PG_HWPOISONED and kept in the radix tree (e.g., for SHMEM THP).
NOTE: no SIGBUS is sent to user space.
Then do_machine_check just returns to user space to resume execution, the
re-execution will result in a #PF and should land to the exact page fault
handling code that generates a SIGBUS with the precise vaddr info:
https://github.com/torvalds/linux/blob/7af08140979a6e7e12b78c93b8625c8d25b084e2/mm/memory.c#L3290
https://github.com/torvalds/linux/blob/7af08140979a6e7e12b78c93b8625c8d25b084e2/mm/memory.c#L3647
Thanks,
-Jue
On Mon, Apr 19, 2021 at 06:49:15PM -0700, Jue Wang wrote:
> On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
> ...
> > + * This function is intended to handle "Action Required" MCEs on already
> > + * hardware poisoned pages. They could happen, for example, when
> > + * memory_failure() failed to unmap the error page at the first call, or
> > + * when multiple Action Optional MCE events races on different CPUs with
> > + * Local MCE enabled.
>
> +Tony Luck
>
> Hey Tony, I thought SRAO MCEs are broadcasted to all cores in the system
> as they come without an execution context, is it correct?
>
> If Yes, Naoya, I think we might want to remove the comments about the
> "multiple Action Optional MCE racing" part.
Hi Jue,
I just wrote a mistake and I meant "when multiple Action Required MCE events races ...".
Sorry and thank you for finding it. This will be fixed in the later versions.
Thanks,
Naoya Horiguchi
On Mon, Apr 19, 2021 at 06:49:15PM -0700, Jue Wang wrote:
> On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
> ...
> > + * This function is intended to handle "Action Required" MCEs on already
> > + * hardware poisoned pages. They could happen, for example, when
> > + * memory_failure() failed to unmap the error page at the first call, or
> > + * when multiple Action Optional MCE events races on different CPUs with
> > + * Local MCE enabled.
>
> +Tony Luck
>
> Hey Tony, I thought SRAO MCEs are broadcasted to all cores in the system
> as they come without an execution context, is it correct?
>
> If Yes, Naoya, I think we might want to remove the comments about the
> "multiple Action Optional MCE racing" part.
Jue,
Correct. SRAO machine checks are broadcast. But rather than remove the
second part, just replace with "multiple local machine checks on different
CPUs".
-Tony
On Mon, Apr 19, 2021 at 07:03:01PM -0700, Jue Wang wrote:
> On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
>
> > This patch suggests to do page table walk to find the error virtual
> > address. If we find multiple virtual addresses in walking, we now can't
> > determine which one is correct, so we fall back to sending SIGBUS in
> > kill_me_maybe() without error info as we do now. This corner case needs
> > to be solved in the future.
>
> Instead of walking the page tables, I wonder what about the following idea:
>
> When failing to get vaddr, memory_failure just ensures the mapping is removed
> and an hwpoisoned swap pte is put in place; or the original page is flagged with
> PG_HWPOISONED and kept in the radix tree (e.g., for SHMEM THP).
To remove the mapping, you need to know the virtual address :-)
Well, I did try a patch that removed *all* user mappings (switched CR3 to
swapper_pgdir) and returned to user. Then have the resulting page fault
report the address. But that didn't work very well.
> NOTE: no SIGBUS is sent to user space.
>
> Then do_machine_check just returns to user space to resume execution, the
> re-execution will result in a #PF and should land to the exact page fault
> handling code that generates a SIGBUS with the precise vaddr info:
That's how SRAO (and other races) are supposed to work.
-Tony
On Tue, Apr 20, 2021 at 8:48 AM Luck, Tony <[email protected]> wrote:
>
> On Mon, Apr 19, 2021 at 07:03:01PM -0700, Jue Wang wrote:
> > On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
> >
> > > This patch suggests to do page table walk to find the error virtual
> > > address. If we find multiple virtual addresses in walking, we now can't
> > > determine which one is correct, so we fall back to sending SIGBUS in
> > > kill_me_maybe() without error info as we do now. This corner case needs
> > > to be solved in the future.
> >
> > Instead of walking the page tables, I wonder what about the following idea:
> >
> > When failing to get vaddr, memory_failure just ensures the mapping is removed
> > and an hwpoisoned swap pte is put in place; or the original page is flagged with
> > PG_HWPOISONED and kept in the radix tree (e.g., for SHMEM THP).
>
> To remove the mapping, you need to know the virtual address :-)
I meant in this case (racing to access the same poisoned pages), the
page mapping should have been removed by and the hwpoison swap pte
installed by the winner thread?
Other racing threads can rely on the subsequent #PFs to get the
correct SIGBUS with accurate vaddr semantics? Or is the goal to "give
back correct SIGBUS with accurate vaddr on _the first MCE on ANY
threads_"? I wonder if that goal is absolutely necessary and can be
relaxed a little to take into account subsequent #PFs.
>
> Well, I did try a patch that removed *all* user mappings (switched CR3 to
> swapper_pgdir) and returned to user. Then have the resulting page fault
> report the address. But that didn't work very well.
Curious what didn't work well in this case? :-)
>
>
>
>
> > NOTE: no SIGBUS is sent to user space.
> >
> > Then do_machine_check just returns to user space to resume execution, the
> > re-execution will result in a #PF and should land to the exact page fault
> > handling code that generates a SIGBUS with the precise vaddr info:
>
> That's how SRAO (and other races) are supposed to work.
Hmm, I wonder why it doesn't apply to this race.
>
> -Tony
> I meant in this case (racing to access the same poisoned pages), the
> page mapping should have been removed by and the hwpoison swap pte
> installed by the winner thread?
My "mutex" patch that Horiguchi-san has included his summary series should
make this happen in most cases. Only problem is if the first thread runs into some
error and does not complete unmapping the poison page from all other tasks.
So the backup plan is to scan the page tables.
>> Well, I did try a patch that removed *all* user mappings (switched CR3 to
>> swapper_pgdir) and returned to user. Then have the resulting page fault
>> report the address. But that didn't work very well.
> Curious what didn't work well in this case? :-)
Andy Lutomirski wasn't happy with the approach. It was specifically
to cover the "kernel accesses poison more than once from get_user()".
It doesn't generalize to the case where the user accessed the poison (because
you'll just take the #PF on the instruction fetch ... everything is unmapped ...
instead of on the original data access).
-Tony
On Tue, Apr 20, 2021 at 08:42:53AM -0700, Luck, Tony wrote:
> On Mon, Apr 19, 2021 at 06:49:15PM -0700, Jue Wang wrote:
> > On Tue, 13 Apr 2021 07:43:20 +0900, Naoya Horiguchi wrote:
> > ...
> > > + * This function is intended to handle "Action Required" MCEs on already
> > > + * hardware poisoned pages. They could happen, for example, when
> > > + * memory_failure() failed to unmap the error page at the first call, or
> > > + * when multiple Action Optional MCE events races on different CPUs with
> > > + * Local MCE enabled.
> >
> > +Tony Luck
> >
> > Hey Tony, I thought SRAO MCEs are broadcasted to all cores in the system
> > as they come without an execution context, is it correct?
> >
> > If Yes, Naoya, I think we might want to remove the comments about the
> > "multiple Action Optional MCE racing" part.
>
> Jue,
>
> Correct. SRAO machine checks are broadcast. But rather than remove the
> second part, just replace with "multiple local machine checks on different
> CPUs".
This looks more precise, so I replaced as such in v3.
Thanks,
Naoya Horiguchi