From: Ankit Agrawal <[email protected]>
The kernel MM currently handles ECC errors / poison only on memory pages
backed by struct page. As part of [1], the nvgrace-gpu-vfio-pci module
maps the device memory to user VA (Qemu) using remap_pfn_range() without
the memory being added to the kernel. These pages are not backed by
struct page.
Implement new ECC handling for memory without struct pages. The kernel MM
exposes registration APIs to allow modules that are managing the device
to register their memory region and a callback function. MM then tracks
such regions using an interval tree.
The mechanism is largely similar to that of ECC on pfn with struct pages.
If there is an ECC error on a pfn, MM uses the registered memory failure
callback function to notify the module of the faulty PFN, so that the
module may take any required action. The pfn is then unmapped in Stage-2.
When the VM tries to access the page, it gets trapped in KVM, which calls
the vm ops fault function. If the module fault function returns
VM_FAULT_HWPOISON, KVM sends a BUS_MCEERR_AR to the usermode (Qemu) mapped
to the poisoned page.
Lastly, the nvgrace-gpu-vfio-pci module makes use of the new mechanism to
get poison handling support on the device memory.
Patch generated over v6.7-rc2 and with [1] applied. [1] is currently under
review.
[1] https://lore.kernel.org/all/[email protected]/
Signed-off-by: Ankit Agrawal <[email protected]>
---
Link for v1: https://lore.kernel.org/all/[email protected]/
v1 -> v2
- Change poisoned page tracking from bitmap to hashtable.
- Addressed miscellaneous comments in v1.
Ankit Agrawal (4):
mm: handle poisoning of pfn without struct pages
mm: Add poison error check in fixup_user_fault() for mapped pfn
mm: Change ghes code to allow poison of non-struct pfn
vfio/nvgpu: register device memory for poison handling
drivers/acpi/apei/ghes.c | 12 +--
drivers/vfio/pci/nvgrace-gpu/main.c | 123 ++++++++++++++++++++++-
drivers/vfio/vfio_main.c | 3 +-
include/linux/memory-failure.h | 22 +++++
include/linux/mm.h | 1 +
include/ras/ras_event.h | 1 +
mm/Kconfig | 1 +
mm/gup.c | 2 +-
mm/memory-failure.c | 146 +++++++++++++++++++++++-----
virt/kvm/kvm_main.c | 6 ++
10 files changed, 278 insertions(+), 39 deletions(-)
create mode 100644 include/linux/memory-failure.h
--
2.17.1
From: Ankit Agrawal <[email protected]>
The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel.
The device memory pages are not backed by struct page. Patches 1-3
implement the mechanism to handle ECC/poison on memory pages without
struct page and expose a registration function. This new mechanism is
leveraged here.
The module registers its memory region with the kernel MM for ECC handling
using the register_pfn_address_space() registration API exposed by the
kernel. It also defines a failure callback function pfn_memory_failure()
to get the poisoned PFN from the MM.
The module tracks poisoned PFNs using a hashtable. The PFN is communicated
by the kernel MM to the module through the failure function, which pushes
the appropriate memory offset to the hashtable.
The module also defines a VMA fault handler for the mapping. It returns
VM_FAULT_HWPOISON in case the memory offset is found in the hashtable.
[1] https://lore.kernel.org/all/[email protected]/
Signed-off-by: Ankit Agrawal <[email protected]>
---
drivers/vfio/pci/nvgrace-gpu/main.c | 123 +++++++++++++++++++++++++++-
drivers/vfio/vfio_main.c | 3 +-
2 files changed, 124 insertions(+), 2 deletions(-)
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b8634974e5cc..5a567375bd14 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -6,6 +6,16 @@
#include <linux/pci.h>
#include <linux/vfio_pci_core.h>
#include <linux/vfio.h>
+#ifdef CONFIG_MEMORY_FAILURE
+#include <linux/bitmap.h>
+#include <linux/memory-failure.h>
+#include <linux/hashtable.h>
+#endif
+
+struct h_node { /* one tracked poisoned page of the device memory */
+ unsigned long mem_offset; /* page offset of the poisoned PFN from the start of the registered PFN range */
+ struct hlist_node node; /* linkage into the per-device htbl hashtable */
+};
struct nvgrace_gpu_vfio_pci_core_device {
struct vfio_pci_core_device core_device;
@@ -13,8 +23,96 @@ struct nvgrace_gpu_vfio_pci_core_device {
size_t memlength;
void *memmap;
struct mutex memmap_lock;
+#ifdef CONFIG_MEMORY_FAILURE
+ struct pfn_address_space pfn_address_space;
+ DECLARE_HASHTABLE(htbl, 8);
+#endif
+};
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Memory-failure callback: MM calls this to report a poisoned PFN that
+ * belongs to the registered pfn_address_space.
+ *
+ * @pfn_space: the pfn_address_space embedded in the nvgrace device structure.
+ * @pfn: the poisoned page frame number.
+ *
+ * The page offset of @pfn relative to the start of the registered range is
+ * recorded in the per-device hashtable so that the VMA fault handler can
+ * later recognise the page as poisoned. PFNs that fall beyond the device
+ * memory length are ignored.
+ */
+static void
+nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+					unsigned long pfn)
+{
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
+		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
+	unsigned long mem_offset = pfn - pfn_space->node.start;
+	struct h_node *ecc;
+
+	if (mem_offset >= (nvdev->memlength >> PAGE_SHIFT))
+		return;
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the hashtable.
+	 * Entries are released with vfree() on device removal, so the
+	 * allocation stays in the vmalloc family.
+	 */
+	ecc = vzalloc(sizeof(*ecc));
+	if (!ecc) {
+		/*
+		 * Nothing can be reported from a void callback; bail out
+		 * instead of dereferencing a NULL pointer. The page simply
+		 * is not tracked in the hashtable.
+		 */
+		return;
+	}
+
+	ecc->mem_offset = mem_offset;
+	hash_add(nvdev->htbl, &ecc->node, ecc->mem_offset);
+}
+
+/*
+ * Callbacks handed to MM together with the registered PFN range; file-local,
+ * referenced only when the range is registered.
+ */
+static struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
+	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
 };
+/*
+ * Fill in the device's pfn_address_space from @vma and register it with MM.
+ *
+ * The range starts at vma->vm_pgoff and spans the device memory length in
+ * pages; the failure callbacks and the file's address_space mapping are
+ * attached before registration.
+ *
+ * Returns the value returned by register_pfn_address_space().
+ */
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
+					struct vm_area_struct *vma)
+{
+	struct pfn_address_space *pas = &nvdev->pfn_address_space;
+	unsigned long first_pfn = vma->vm_pgoff;
+	unsigned long npages = nvdev->memlength >> PAGE_SHIFT;
+
+	pas->node.start = first_pfn;
+	pas->node.last = first_pfn + npages - 1;
+	pas->ops = &nvgrace_gpu_vfio_pci_pas_ops;
+	pas->mapping = vma->vm_file->f_mapping;
+
+	return register_pfn_address_space(pas);
+}
+
+
+extern struct vfio_device *vfio_device_from_file(struct file *file);
+
+/*
+ * VMA fault handler for the device memory mapping.
+ *
+ * Looks up the faulting page offset (relative to the start of the VMA's
+ * PFN range) in the per-device poison hashtable.
+ *
+ * Returns VM_FAULT_HWPOISON when the page is tracked as poisoned. Any other
+ * fault (no backing file, no vfio device, offset out of range, or page not
+ * poisoned) is reported as VM_FAULT_SIGBUS. VM_FAULT_ERROR must not be
+ * returned: it is the mask of all error bits (including VM_FAULT_OOM), not
+ * a single fault result.
+ */
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
+	struct vfio_device *core_vdev;
+	struct h_node *cur;
+
+	if (!vmf->vma->vm_file)
+		return VM_FAULT_SIGBUS;
+
+	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
+	if (!core_vdev)
+		return VM_FAULT_SIGBUS;
+
+	nvdev = container_of(core_vdev,
+			     struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
+
+	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT)) {
+		/*
+		 * Check if the page is poisoned.
+		 */
+		hash_for_each_possible(nvdev->htbl, cur, node, mem_offset) {
+			if (cur->mem_offset == mem_offset)
+				return VM_FAULT_HWPOISON;
+		}
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault, /* reports VM_FAULT_HWPOISON for offsets tracked in the poison hashtable */
+};
+#endif
+
static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
{
struct vfio_pci_core_device *vdev =
@@ -46,6 +144,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
mutex_destroy(&nvdev->memmap_lock);
+#ifdef CONFIG_MEMORY_FAILURE
+ unregister_pfn_address_space(&(nvdev->pfn_address_space));
+#endif
vfio_pci_core_close_device(core_vdev);
}
@@ -103,8 +204,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
return ret;
vma->vm_pgoff = start_pfn;
+#ifdef CONFIG_MEMORY_FAILURE
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
- return 0;
+ ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
+#endif
+ return ret;
}
static long
@@ -413,6 +518,12 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
nvdev->memlength = memlength;
+#ifdef CONFIG_MEMORY_FAILURE
+ /*
+ * Initialize the hashtable tracking the poisoned pages.
+ */
+ hash_init(nvdev->htbl);
+#endif
return ret;
}
@@ -448,6 +559,16 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
{
struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
struct vfio_pci_core_device *vdev = &nvdev->core_device;
+#ifdef CONFIG_MEMORY_FAILURE
+ struct h_node *cur;
+ unsigned long bkt;
+ struct hlist_node *tmp_node;
+
+ hash_for_each_safe(nvdev->htbl, bkt, tmp_node, cur, node) {
+ hash_del(&cur->node);
+ vfree(cur);
+ }
+#endif
vfio_pci_core_unregister_device(vdev);
vfio_put_device(&vdev->vdev);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 8d4995ada74a..290431ac2e00 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1319,7 +1319,7 @@ const struct file_operations vfio_device_fops = {
.mmap = vfio_device_fops_mmap,
};
-static struct vfio_device *vfio_device_from_file(struct file *file)
+struct vfio_device *vfio_device_from_file(struct file *file)
{
struct vfio_device_file *df = file->private_data;
@@ -1327,6 +1327,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
return NULL;
return df->device;
}
+EXPORT_SYMBOL_GPL(vfio_device_from_file);
/**
* vfio_file_is_valid - True if the file is valid vfio file
--
2.17.1