This patchset aims to support shared page tracking for fsdax.
Changes from V3:
- Introduce dax_device->holder to split dax specific issues from
block device, instead of the ->corrupted_range() in
block_device_operations
- Fix other mistakes and problems
- Rebased to v5.13-rc4
This patchset moves owner tracking from dax_associate_entry() to the
pmem device driver, by introducing a ->memory_failure() interface in
struct dev_pagemap_ops. This interface is called by memory_failure()
in mm, and implemented by the pmem device driver.
The holder operations are then called to find the filesystem in which
the corrupted data is located, and the filesystem handler is called to
track the files or metadata associated with this page.
Finally, we can try to fix the corrupted data in the filesystem and do
other necessary processing, such as killing the processes that are
using the affected files.
The call trace is like this:
memory_failure()
 pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
  dax_device->holder_ops->corrupted_range() =>
                                      - fs_dax_corrupted_range()
                                      - md_dax_corrupted_range()
   sb->s_op->corrupted_range()      => xfs_fs_corrupted_range()
    xfs_rmap_query_range()
     xfs_corrupt_helper()
      * corrupted on metadata:
          try to recover data, call xfs_force_shutdown()
      * corrupted on file data:
          try to recover data, call mf_dax_kill_procs()
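
Taken together, the hooks this series introduces at each layer have
roughly the following shape. This is a sketch for orientation only:
the ->memory_failure() signature is quoted from the memremap patch
below, while the dax holder and super_block signatures are inferred
from the dm and xfs implementations in this series.

/* struct dev_pagemap_ops (mm -> pmem driver) */
int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
		      int flags);

/* struct dax_holder_operations (pmem driver -> holder: fs or dm) */
int (*corrupted_range)(struct dax_device *dax_dev,
		       struct block_device *bdev, loff_t offset,
		       size_t length, void *data);

/* struct super_operations (holder -> filesystem) */
int (*corrupted_range)(struct super_block *sb,
		       struct block_device *bdev, loff_t offset,
		       size_t len, void *data);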
The fsdax & reflink support for XFS is not included in this patchset.
(Rebased on v5.13-rc4)
==
Shiyang Ruan (10):
pagemap: Introduce ->memory_failure()
dax: Introduce holder for dax_device
fs: Introduce ->corrupted_range() for superblock
mm, fsdax: Refactor memory-failure handler for dax mapping
mm, pmem: Implement ->memory_failure() in pmem driver
fs/dax: Implement dax_holder_operations
dm: Introduce ->rmap() to find bdev offset
md: Implement dax_holder_operations
xfs: Implement ->corrupted_range() for XFS
fs/dax: Remove useless functions
block/genhd.c | 30 ++++++
drivers/dax/super.c | 38 ++++++++
drivers/md/dm-linear.c | 20 ++++
drivers/md/dm.c | 119 ++++++++++++++++++++++-
drivers/nvdimm/pmem.c | 14 +++
fs/dax.c | 79 +++++++---------
fs/xfs/xfs_fsops.c | 5 +
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 108 +++++++++++++++++++++
include/linux/dax.h | 13 +++
include/linux/device-mapper.h | 5 +
include/linux/fs.h | 2 +
include/linux/genhd.h | 1 +
include/linux/memremap.h | 8 ++
include/linux/mm.h | 9 ++
mm/memory-failure.c | 173 ++++++++++++++++++++++------------
16 files changed, 520 insertions(+), 105 deletions(-)
--
2.31.1
The current memory_failure_dev_pagemap() can only handle a
single-mapped dax page in fsdax mode. The dax page could be mapped by
multiple files and offsets if we let the reflink feature and fsdax mode
work together. So, we refactor the current implementation to support
handling a memory failure on each file and offset.
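
For context, the helper introduced here, mf_dax_kill_procs(), is
consumed by the xfs handler in patch 9 of this series. A minimal
sketch of the expected call site, where ip, rec, and mf_flags stand in
for values produced by the rmap query in that patch:

	/* Kill the processes mapping this (file, offset) pair; the
	 * offset is a page index within the file's address_space. */
	struct address_space *mapping = VFS_I(ip)->i_mapping;
	int rc = mf_dax_kill_procs(mapping, rec->rm_offset, mf_flags);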
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 21 ++++++++
include/linux/dax.h | 1 +
include/linux/mm.h | 9 ++++
mm/memory-failure.c | 114 ++++++++++++++++++++++++++++----------------
4 files changed, 105 insertions(+), 40 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 62352cbcf0f4..58faca85455a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -389,6 +389,27 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
+/*
+ * dax_load_pfn - Load pfn of the DAX entry corresponding to a page
+ * @mapping: The file whose entry we want to load
+ * @index: The offset where the DAX entry located in
+ *
+ * Return: pfn of the DAX entry
+ */
+unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ void *entry;
+ unsigned long pfn;
+
+ xas_lock_irq(&xas);
+ entry = xas_load(&xas);
+ pfn = dax_to_pfn(entry);
+ xas_unlock_irq(&xas);
+
+ return pfn;
+}
+
/*
* dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 1ce343a960ab..6e758daa5004 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -158,6 +158,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
#else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75efcf9..2b7527e93c77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1187,6 +1187,14 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
+static inline bool is_device_fsdax_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_FS_DAX) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
@@ -3078,6 +3086,7 @@ enum mf_flags {
MF_MUST_KILL = 1 << 2,
MF_SOFT_OFFLINE = 1 << 3,
};
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 85ad98c00fd9..4377e727d478 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
+#include <linux/dax.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -120,6 +121,13 @@ static int hwpoison_filter_dev(struct page *p)
if (PageSlab(p))
return -EINVAL;
+ if (pfn_valid(page_to_pfn(p))) {
+ if (is_device_fsdax_page(p))
+ return 0;
+ else
+ return -EINVAL;
+ }
+
mapping = page_mapping(p);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -290,10 +298,9 @@ void shake_page(struct page *p, int access)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -333,9 +340,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
-static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
+ struct vm_area_struct *vma, struct list_head *to_kill)
{
struct to_kill *tk;
@@ -346,9 +352,12 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ if (is_zone_device_page(p)) {
+ if (is_device_fsdax_page(p))
+ tk->addr = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -496,7 +505,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -506,24 +515,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
- int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+ pgoff_t pgoff, struct list_head *to_kill, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff;
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
- pgoff = page_to_pgoff(page);
for_each_process(tsk) {
struct task_struct *t = task_early_kill(tsk, force_early);
-
if (!t)
continue;
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
- pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
@@ -532,7 +536,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, pgoff, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -551,7 +555,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, force_early);
+ collect_procs_file(page, page_mapping(page), page_to_pgoff(page),
+ tokill, force_early);
}
static const char *action_name[] = {
@@ -1218,6 +1223,51 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
return 0;
}
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+ struct address_space *mapping, pgoff_t index, int flags)
+{
+ struct to_kill *tk;
+ unsigned long size = 0;
+ loff_t start;
+
+ list_for_each_entry(tk, to_kill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up
+ * device-dax mappings which are constant size. The
+ * actual size of the mapping being torn down is
+ * communicated in siginfo, see kill_proc()
+ */
+ start = (index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(mapping, start, size, 0);
+ }
+
+ kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags)
+{
+ LIST_HEAD(to_kill);
+ /* load the pfn of the dax mapping file */
+ unsigned long pfn = dax_load_pfn(mapping, index);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs_file(pfn_to_page(pfn), mapping, index, &to_kill,
+ flags & MF_ACTION_REQUIRED);
+
+ unmap_and_kill(&to_kill, pfn, mapping, index, flags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1298,12 +1348,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- const bool unmap_success = true;
- unsigned long size = 0;
- struct to_kill *tk;
- LIST_HEAD(tokill);
+ LIST_HEAD(to_kill);
int rc = -EBUSY;
- loff_t start;
dax_entry_t cookie;
if (flags & MF_COUNT_INCREASED)
@@ -1355,22 +1401,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs_file(page, page->mapping, page->index, &to_kill,
+ flags & MF_ACTION_REQUIRED);
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
- /*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
- */
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, size, 0);
- }
- kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+ unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
rc = 0;
unlock:
dax_unlock_page(page, cookie);
--
2.31.1
A pmem device could be a target of a mapped device. In order to find
out the global location on a mapped device, we introduce this interface
to translate an offset on the target device into an offset on the
mapped device.
Currently, we implement it for the linear target, where the translation
is easy. Other targets will be supported in the future; however, some
targets may not be able to support it because of their non-linear
mapping.
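
A minimal sketch of how a caller is expected to drive the new hook;
report_bad_sector() and ctx are hypothetical names here, and the real
in-tree caller is dm_dax_corrputed_range_one() in the dm patch later
in this series:

static int report_bad_sector(struct dm_target *ti,
			     struct block_device *bdev,
			     sector_t sect, void *data)
{
	/* sect is now relative to the mapped device, not the target */
	pr_err("dm: bad sector %llu\n", (unsigned long long)sect);
	return 0;
}

	/* ti resolved from the live table, sect relative to the target: */
	if (ti->type->rmap)
		rc = ti->type->rmap(ti, sect, report_bad_sector, ctx);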
Signed-off-by: Shiyang Ruan <[email protected]>
---
drivers/md/dm-linear.c | 20 ++++++++++++++++++++
include/linux/device-mapper.h | 5 +++++
2 files changed, 25 insertions(+)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 92db0f5e7f28..f9f9bc765ba7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -5,6 +5,7 @@
*/
#include "dm.h"
+#include "dm-core.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
@@ -119,6 +120,24 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
+static int linear_rmap(struct dm_target *ti, sector_t offset,
+ rmap_callout_fn fn, void *data)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+ struct mapped_device *md = ti->table->md;
+ struct block_device *bdev;
+ sector_t disk_sect = offset - dm_target_offset(ti, lc->start);
+ int rc = -ENODEV;
+
+ bdev = bdget_disk_sector(md->disk, offset);
+ if (!bdev)
+ return rc;
+
+ rc = fn(ti, bdev, disk_sect, data);
+ bdput(bdev);
+ return rc;
+}
+
static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -236,6 +255,7 @@ static struct target_type linear_target = {
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
+ .rmap = linear_rmap,
.status = linear_status,
.prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ff700fb6ce1d..89a893565407 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -58,6 +58,10 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
* = 2: The target wants to push back the io
*/
typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio);
+typedef int (*rmap_callout_fn) (struct dm_target *ti, struct block_device *bdev,
+ sector_t sect, void *data);
+typedef int (*dm_rmap_fn) (struct dm_target *ti, sector_t offset,
+ rmap_callout_fn fn, void *data);
typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
struct request *rq,
union map_info *map_context,
@@ -184,6 +188,7 @@ struct target_type {
dm_ctr_fn ctr;
dm_dtr_fn dtr;
dm_map_fn map;
+ dm_rmap_fn rmap;
dm_clone_and_map_request_fn clone_and_map_rq;
dm_release_clone_request_fn release_clone_rq;
dm_endio_fn end_io;
--
2.31.1
When a memory failure occurs, we call this function, which is
implemented by each kind of device. For the fsdax case, the pmem device
driver implements it. The pmem device driver will find out the
filesystem in which the corrupted page is located, and finally call the
filesystem handler to deal with this error.
The filesystem will try to recover the corrupted data if possible.
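
The pmem implementation (patch 5 of this series, not quoted in this
thread) essentially translates the pfn into a device-relative byte
offset and forwards it to whoever holds the dax_device. Roughly, as a
sketch; the pmem_device field names (notably ->bdev) are assumptions
here, not a quote of the real patch:

static int pmem_pgmap_memory_failure(struct dev_pagemap *pgmap,
				     unsigned long pfn, int flags)
{
	struct pmem_device *pmem =
		container_of(pgmap, struct pmem_device, pgmap);
	loff_t offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;

	/* forward to the dax holder (filesystem or device-mapper) */
	return dax_corrupted_range(pmem->dax_dev, pmem->bdev, offset,
				   PAGE_SIZE, &flags);
}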
Signed-off-by: Shiyang Ruan <[email protected]>
---
include/linux/memremap.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 45a79da89c5f..473fe18c516a 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -87,6 +87,14 @@ struct dev_pagemap_ops {
* the page back to a CPU accessible page.
*/
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+
+ /*
+ * Handle the memory failure happens on one page. Notify the processes
+ * who are using this page, and try to recover the data on this page
+ * if necessary.
+ */
+ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
+ int flags);
};
#define PGMAP_ALTMAP_VALID (1 << 0)
--
2.31.1
This is the case where the holder represents a mapped device, or, more
exactly, a list of mapped devices (because it is possible to create
more than one mapped device on one pmem device).
Find out which mapped device the offset belongs to, and translate the
offset from the target device to the mapped device. When that is done,
call dax_corrupted_range() for the holder of this mapped device.
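
The holder API itself comes from patch 2 ("dax: Introduce holder for
dax_device"), which is not quoted in this thread. Judging from the
call sites in this patch and in the xfs patch, its shape is presumably:

/* Assumed declarations, inferred from the call sites below: */
void dax_set_holder(struct dax_device *dax_dev, void *holder,
		    const struct dax_holder_operations *ops);
void *dax_get_holder(struct dax_device *dax_dev);
int dax_corrupted_range(struct dax_device *dax_dev,
			struct block_device *bdev, loff_t offset,
			size_t len, void *data);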
Signed-off-by: Shiyang Ruan <[email protected]>
---
drivers/md/dm.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 118 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ca2aedd8ee7d..606ad74ccf87 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -749,7 +749,11 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
}
static char *_dm_claim_ptr = "I belong to device-mapper";
-
+static const struct dax_holder_operations dm_dax_holder_ops;
+struct dm_holder {
+ struct list_head list;
+ struct mapped_device *md;
+};
/*
* Open a table device so we can use it as a map destination.
*/
@@ -757,6 +761,8 @@ static int open_table_device(struct table_device *td, dev_t dev,
struct mapped_device *md)
{
struct block_device *bdev;
+ struct list_head *holders;
+ struct dm_holder *holder;
int r;
@@ -774,6 +780,17 @@ static int open_table_device(struct table_device *td, dev_t dev,
td->dm_dev.bdev = bdev;
td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+
+ holders = dax_get_holder(td->dm_dev.dax_dev);
+ if (!holders) {
+ holders = kmalloc(sizeof(*holders), GFP_KERNEL);
+ INIT_LIST_HEAD(holders);
+ dax_set_holder(td->dm_dev.dax_dev, holders, &dm_dax_holder_ops);
+ }
+ holder = kmalloc(sizeof(*holder), GFP_KERNEL);
+ holder->md = md;
+ list_add_tail(&holder->list, holders);
+
return 0;
}
@@ -782,9 +799,24 @@ static int open_table_device(struct table_device *td, dev_t dev,
*/
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
+ struct list_head *holders;
+ struct dm_holder *holder, *n;
+
if (!td->dm_dev.bdev)
return;
+ holders = dax_get_holder(td->dm_dev.dax_dev);
+ if (holders) {
+ list_for_each_entry_safe(holder, n, holders, list) {
+ if (holder->md == md) {
+ list_del(&holder->list);
+ kfree(holder);
+ }
+ }
+ if (list_empty(holders))
+ kfree(holders);
+ }
+
bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
put_dax(td->dm_dev.dax_dev);
@@ -1235,6 +1267,87 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
+struct corrupted_hit_info {
+ struct block_device *bdev;
+ sector_t offset;
+};
+
+static int dm_blk_corrupted_hit(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t count, void *data)
+{
+ struct corrupted_hit_info *bc = data;
+
+ return bc->bdev == (void *)dev->bdev &&
+ (start <= bc->offset && bc->offset < start + count);
+}
+
+struct corrupted_do_info {
+ size_t length;
+ void *data;
+};
+
+static int dm_blk_corrupted_do(struct dm_target *ti, struct block_device *bdev,
+ sector_t sector, void *data)
+{
+ struct mapped_device *md = ti->table->md;
+ struct corrupted_do_info *bc = data;
+ struct block_device *md_bdev = bdget_disk_sector(md->disk, sector);
+
+ return dax_corrupted_range(md->dax_dev, md_bdev, to_bytes(sector),
+ bc->length, bc->data);
+}
+
+static int dm_dax_corrputed_range_one(struct mapped_device *md,
+ struct block_device *bdev, loff_t offset,
+ size_t length, void *data)
+{
+ struct dm_table *map;
+ struct dm_target *ti;
+ sector_t sect = to_sector(offset);
+ struct corrupted_hit_info hi = {bdev, sect};
+ struct corrupted_do_info di = {length, data};
+ int srcu_idx, i, rc = -ENODEV;
+
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return rc;
+
+ /*
+ * find the target device, and then translate the offset of this target
+ * to the whole mapped device.
+ */
+ for (i = 0; i < dm_table_get_num_targets(map); i++) {
+ ti = dm_table_get_target(map, i);
+ if (!(ti->type->iterate_devices && ti->type->rmap))
+ continue;
+ if (!ti->type->iterate_devices(ti, dm_blk_corrupted_hit, &hi))
+ continue;
+
+ rc = ti->type->rmap(ti, sect, dm_blk_corrupted_do, &di);
+ break;
+ }
+
+ dm_put_live_table(md, srcu_idx);
+ return rc;
+}
+
+static int dm_dax_corrputed_range(struct dax_device *dax_dev,
+ struct block_device *bdev,
+ loff_t offset, size_t length, void *data)
+{
+ struct dm_holder *holder;
+ struct list_head *holders = dax_get_holder(dax_dev);
+ int rc = -ENODEV;
+
+ list_for_each_entry(holder, holders, list) {
+ rc = dm_dax_corrputed_range_one(holder->md, bdev, offset,
+ length, data);
+ if (rc != -ENODEV)
+ break;
+ }
+ return rc;
+}
+
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
@@ -3157,6 +3270,10 @@ static const struct dax_operations dm_dax_ops = {
.zero_page_range = dm_dax_zero_page_range,
};
+static const struct dax_holder_operations dm_dax_holder_ops = {
+ .corrupted_range = dm_dax_corrputed_range,
+};
+
/*
* module hooks
*/
--
2.31.1
This function is used to handle errors which may cause data loss in
the filesystem, such as a memory failure in fsdax mode.
If the rmap feature of XFS is enabled, we can query it to find the
files and metadata which are associated with the corrupted data. For
now all we do is kill processes with that file mapped into their
address spaces, but future patches could actually do something about
corrupt metadata.
After that, the memory failure handler needs to notify the processes
that are using those files.
Only the data device is supported; the realtime device is not supported
for now.
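
For orientation, control reaches xfs_fs_corrupted_range() through the
fs-level dax holder registered in xfs_open_devices() below. Patch 6
(fs/dax: Implement dax_holder_operations) is not quoted in this
thread, but given that the holder is the super_block, its glue
presumably amounts to:

/* Sketch of the assumed fs-level holder callback (patch 6): */
static int fs_dax_corrupted_range(struct dax_device *dax_dev,
				  struct block_device *bdev, loff_t offset,
				  size_t len, void *data)
{
	struct super_block *sb = dax_get_holder(dax_dev);

	return sb->s_op->corrupted_range(sb, bdev, offset, len, data);
}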
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_fsops.c | 5 +++
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 108 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 114 insertions(+)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index be9cf88d2ad7..e89ada33d8fc 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -551,6 +551,11 @@ xfs_do_force_shutdown(
"Corruption of in-memory data detected. Shutting down filesystem");
if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
xfs_stack_trace();
+ } else if (flags & SHUTDOWN_CORRUPT_META) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+"Corruption of on-disk metadata detected. Shutting down filesystem");
+ if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+ xfs_stack_trace();
} else if (logerror) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
"Log I/O Error Detected. Shutting down filesystem");
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bb67274ee23f..c62ccf3e07d0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -276,6 +276,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_CORRUPT_META 0x0010 /* corrupt metadata on device */
/*
* Flags for xfs_mountfs
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a2dab05332ac..498edaeb8363 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -36,6 +36,11 @@
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
#include "xfs_pwork.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bit.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -392,6 +397,7 @@ xfs_open_devices(
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
+ dax_set_holder(dax_ddev, mp->m_super, &fs_dax_holder_ops);
/*
* Open real time and log devices - order is important.
*/
@@ -400,6 +406,9 @@ xfs_open_devices(
if (error)
goto out;
dax_logdev = fs_dax_get_by_bdev(logdev);
+ if (dax_logdev != dax_ddev)
+ dax_set_holder(dax_logdev, mp->m_super,
+ &fs_dax_holder_ops);
}
if (mp->m_rtname) {
@@ -414,6 +423,7 @@ xfs_open_devices(
goto out_close_rtdev;
}
dax_rtdev = fs_dax_get_by_bdev(rtdev);
+ dax_set_holder(dax_rtdev, mp->m_super, &fs_dax_holder_ops);
}
/*
@@ -1076,6 +1086,103 @@ xfs_fs_free_cached_objects(
return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}
+static int
+xfs_corrupt_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_inode *ip;
+ struct address_space *mapping;
+ int rc = 0;
+ int *flags = data;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+ (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ // TODO check and try to fix metadata
+ xfs_force_shutdown(cur->bc_mp, SHUTDOWN_CORRUPT_META);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Get files that incore, filter out others that are not in use.
+ */
+ rc = xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ if (rc)
+ return rc;
+
+ mapping = VFS_I(ip)->i_mapping;
+ rc = mf_dax_kill_procs(mapping, rec->rm_offset, *flags);
+
+ // TODO try to fix data
+ xfs_irele(ip);
+
+ return rc;
+}
+
+static int
+xfs_fs_corrupted_range(
+ struct super_block *sb,
+ struct block_device *bdev,
+ loff_t offset,
+ size_t len,
+ void *data)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_rmap_irec rmap_low, rmap_high;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_fsblock_t fsbno = XFS_B_TO_FSB(mp, offset);
+ xfs_filblks_t bcnt = XFS_B_TO_FSB(mp, len);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+ int error = 0;
+
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev == bdev) {
+ xfs_warn(mp, "corrupted_range support not available for realtime device!");
+ return -EOPNOTSUPP;
+ }
+ if (mp->m_logdev_targp && mp->m_logdev_targp->bt_bdev == bdev &&
+ mp->m_logdev_targp != mp->m_ddev_targp) {
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_META);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ xfs_warn(mp, "corrupted_range needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
+ if (error)
+ goto out_cancel_tp;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
+
+ /* Construct a range for rmap query */
+ memset(&rmap_low, 0, sizeof(rmap_low));
+ memset(&rmap_high, 0xFF, sizeof(rmap_high));
+ rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
+ rmap_low.rm_blockcount = rmap_high.rm_blockcount = bcnt;
+
+ error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
+ xfs_corrupt_helper, data);
+
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+
+out_cancel_tp:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1089,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
.show_options = xfs_fs_show_options,
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
+ .corrupted_range = xfs_fs_corrupted_range,
};
static int
--
2.31.1
Since owner tracking is now triggered by the pmem device, these
functions are no longer needed, so remove them.
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 46 ----------------------------------------------
1 file changed, 46 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 1a7473f46df2..aff1e0f58967 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,48 +334,6 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
-/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address)
-{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
-{
- unsigned long pfn;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
-}
-
static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
@@ -554,7 +512,6 @@ static void *grab_mapping_entry(struct xa_state *xas,
xas_lock_irq(xas);
}
- dax_disassociate_entry(entry, mapping, false);
xas_store(xas, NULL); /* undo the PMD join */
dax_wake_entry(xas, entry, WAKE_ALL);
mapping->nrpages -= PG_PMD_NR;
@@ -691,7 +648,6 @@ static int __dax_invalidate_entry(struct address_space *mapping,
(xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
- dax_disassociate_entry(entry, mapping, trunc);
xas_store(&xas, NULL);
mapping->nrpages -= 1UL << dax_entry_order(entry);
ret = 1;
@@ -785,8 +741,6 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
- dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
--
2.31.1
Hi Shiyang,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on xfs-linux/for-next]
[also build test ERROR on dm/for-next linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master next-20210603]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
config: csky-randconfig-r014-20210604 (attached as .config)
compiler: csky-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/8fc6cb02d396487fa3a77fb57f23dcdc978dd3e3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
git checkout 8fc6cb02d396487fa3a77fb57f23dcdc978dd3e3
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=csky
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
fs/xfs/xfs_super.c: In function 'xfs_open_devices':
>> fs/xfs/xfs_super.c:400:2: error: implicit declaration of function 'dax_set_holder'; did you mean 'xas_set_order'? [-Werror=implicit-function-declaration]
400 | dax_set_holder(dax_ddev, mp->m_super, &fs_dax_holder_ops);
| ^~~~~~~~~~~~~~
| xas_set_order
cc1: some warnings being treated as errors
vim +400 fs/xfs/xfs_super.c
379
380 /*
381 * The file system configurations are:
382 * (1) device (partition) with data and internal log
383 * (2) logical volume with data and log subvolumes.
384 * (3) logical volume with data, log, and realtime subvolumes.
385 *
386 * We only have to handle opening the log and realtime volumes here if
387 * they are present. The data subvolume has already been opened by
388 * get_sb_bdev() and is stored in sb->s_bdev.
389 */
390 STATIC int
391 xfs_open_devices(
392 struct xfs_mount *mp)
393 {
394 struct block_device *ddev = mp->m_super->s_bdev;
395 struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
396 struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
397 struct block_device *logdev = NULL, *rtdev = NULL;
398 int error;
399
> 400 dax_set_holder(dax_ddev, mp->m_super, &fs_dax_holder_ops);
401 /*
402 * Open real time and log devices - order is important.
403 */
404 if (mp->m_logname) {
405 error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
406 if (error)
407 goto out;
408 dax_logdev = fs_dax_get_by_bdev(logdev);
409 if (dax_logdev != dax_ddev)
410 dax_set_holder(dax_logdev, mp->m_super,
411 &fs_dax_holder_ops);
412 }
413
414 if (mp->m_rtname) {
415 error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
416 if (error)
417 goto out_close_logdev;
418
419 if (rtdev == ddev || rtdev == logdev) {
420 xfs_warn(mp,
421 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
422 error = -EINVAL;
423 goto out_close_rtdev;
424 }
425 dax_rtdev = fs_dax_get_by_bdev(rtdev);
426 dax_set_holder(dax_rtdev, mp->m_super, &fs_dax_holder_ops);
427 }
428
429 /*
430 * Setup xfs_mount buffer target pointers
431 */
432 error = -ENOMEM;
433 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
434 if (!mp->m_ddev_targp)
435 goto out_close_rtdev;
436
437 if (rtdev) {
438 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
439 if (!mp->m_rtdev_targp)
440 goto out_free_ddev_targ;
441 }
442
443 if (logdev && logdev != ddev) {
444 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
445 if (!mp->m_logdev_targp)
446 goto out_free_rtdev_targ;
447 } else {
448 mp->m_logdev_targp = mp->m_ddev_targp;
449 }
450
451 return 0;
452
453 out_free_rtdev_targ:
454 if (mp->m_rtdev_targp)
455 xfs_free_buftarg(mp->m_rtdev_targp);
456 out_free_ddev_targ:
457 xfs_free_buftarg(mp->m_ddev_targp);
458 out_close_rtdev:
459 xfs_blkdev_put(rtdev);
460 fs_put_dax(dax_rtdev);
461 out_close_logdev:
462 if (logdev && logdev != ddev) {
463 xfs_blkdev_put(logdev);
464 fs_put_dax(dax_logdev);
465 }
466 out:
467 fs_put_dax(dax_ddev);
468 return error;
469 }
470
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Hi Shiyang,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on xfs-linux/for-next]
[also build test ERROR on dm/for-next linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master next-20210603]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
config: parisc-randconfig-r016-20210604 (attached as .config)
compiler: hppa-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/8fc6cb02d396487fa3a77fb57f23dcdc978dd3e3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
git checkout 8fc6cb02d396487fa3a77fb57f23dcdc978dd3e3
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=parisc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
hppa-linux-ld: fs/xfs/xfs_super.o: in function `xfs_open_devices':
>> (.text+0x1e94): undefined reference to `fs_dax_holder_ops'
>> hppa-linux-ld: (.text+0x1e98): undefined reference to `fs_dax_holder_ops'
hppa-linux-ld: (.text+0x1ff0): undefined reference to `fs_dax_holder_ops'
hppa-linux-ld: fs/xfs/xfs_super.o: in function `.LC45':
>> (.rodata.cst4+0x70): undefined reference to `mf_dax_kill_procs'
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
Hi Shiyang,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on xfs-linux/for-next]
[also build test ERROR on dm/for-next linus/master v5.13-rc4]
[cannot apply to hnaz-linux-mm/master next-20210603]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
base: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git for-next
config: h8300-randconfig-r021-20210604 (attached as .config)
compiler: h8300-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/94db8a17905296e4d5bfe93eb5199f477646622a
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20210604-092105
git checkout 94db8a17905296e4d5bfe93eb5199f477646622a
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=h8300
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All error/warnings (new ones prefixed by >>):
drivers/md/dm.c: In function 'open_table_device':
>> drivers/md/dm.c:784:12: error: implicit declaration of function 'dax_get_holder'; did you mean 'xa_get_order'? [-Werror=implicit-function-declaration]
784 | holders = dax_get_holder(td->dm_dev.dax_dev);
| ^~~~~~~~~~~~~~
| xa_get_order
>> drivers/md/dm.c:784:10: warning: assignment to 'struct list_head *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
784 | holders = dax_get_holder(td->dm_dev.dax_dev);
| ^
>> drivers/md/dm.c:788:3: error: implicit declaration of function 'dax_set_holder'; did you mean 'xas_set_order'? [-Werror=implicit-function-declaration]
788 | dax_set_holder(td->dm_dev.dax_dev, holders, &dm_dax_holder_ops);
| ^~~~~~~~~~~~~~
| xas_set_order
drivers/md/dm.c: In function 'close_table_device':
drivers/md/dm.c:808:10: warning: assignment to 'struct list_head *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
808 | holders = dax_get_holder(td->dm_dev.dax_dev);
| ^
drivers/md/dm.c: In function 'dm_dax_corrputed_range':
>> drivers/md/dm.c:1339:30: warning: initialization of 'struct list_head *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
1339 | struct list_head *holders = dax_get_holder(dax_dev);
| ^~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +784 drivers/md/dm.c
750
751 static char *_dm_claim_ptr = "I belong to device-mapper";
752 static const struct dax_holder_operations dm_dax_holder_ops;
753 struct dm_holder {
754 struct list_head list;
755 struct mapped_device *md;
756 };
757 /*
758 * Open a table device so we can use it as a map destination.
759 */
760 static int open_table_device(struct table_device *td, dev_t dev,
761 struct mapped_device *md)
762 {
763 struct block_device *bdev;
764 struct list_head *holders;
765 struct dm_holder *holder;
766
767 int r;
768
769 BUG_ON(td->dm_dev.bdev);
770
771 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
772 if (IS_ERR(bdev))
773 return PTR_ERR(bdev);
774
775 r = bd_link_disk_holder(bdev, dm_disk(md));
776 if (r) {
777 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
778 return r;
779 }
780
781 td->dm_dev.bdev = bdev;
782 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
783
> 784 holders = dax_get_holder(td->dm_dev.dax_dev);
785 if (!holders) {
786 holders = kmalloc(sizeof(*holders), GFP_KERNEL);
787 INIT_LIST_HEAD(holders);
> 788 dax_set_holder(td->dm_dev.dax_dev, holders, &dm_dax_holder_ops);
789 }
790 holder = kmalloc(sizeof(*holder), GFP_KERNEL);
791 holder->md = md;
792 list_add_tail(&holder->list, holders);
793
794 return 0;
795 }
796
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On Thu, Jun 3, 2021 at 6:19 PM Shiyang Ruan <[email protected]> wrote:
Hi Ruan, apologies for the delays circling back to this.
>
> When memory-failure occurs, we call this function which is implemented
> by each kind of devices. For the fsdax case, pmem device driver
> implements it. Pmem device driver will find out the filesystem in which
> the corrupted page located in. And finally call filesystem handler to
> deal with this error.
>
> The filesystem will try to recover the corrupted data if possible.
>
Let's move this change to the patch that needs it, this patch does not
do anything on its own.
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> include/linux/memremap.h | 8 ++++++++
> 1 file changed, 8 insertions(+)
>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 45a79da89c5f..473fe18c516a 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -87,6 +87,14 @@ struct dev_pagemap_ops {
> * the page back to a CPU accessible page.
> */
> vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
> +
> + /*
> + * Handle the memory failure happens on one page. Notify the processes
> + * who are using this page, and try to recover the data on this page
> + * if necessary.
> + */
I thought we discussed that this needed to be range based here:
https://lore.kernel.org/r/CAPcyv4jhUU3NVD8HLZnJzir+SugB6LnnrgJZ-jP45BZrbJ1dJQ@mail.gmail.com
...but also incorporate Christoph's feedback to not use notifiers.
> + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
> + int flags);
Change this callback to
int (*notify_memory_failure)(struct dev_pagemap *pgmap, unsigned long
pfn, unsigned long nr_pfns)
...to pass a range and to clarify that this callback is for
memory_failure() to notify the pgmap, the pgmap notifies the owner via
the holder callbacks.
[ drop old nvdimm list, add the new one ]
On Thu, Jun 3, 2021 at 6:19 PM Shiyang Ruan <[email protected]> wrote:
>
> The current memory_failure_dev_pagemap() can only handle single-mapped
> dax page for fsdax mode. The dax page could be mapped by multiple files
> and offsets if we let reflink feature & fsdax mode work together. So,
> we refactor current implementation to support handle memory failure on
> each file and offset.
I don't understand this organization, perhaps because this patch
introduces mf_dax_kill_procs() without a user. However, my expectation
is that memory_failure() is mostly untouched save for an early check
via pgmap->notify_memory_failure(). If pgmap->notify_memory_failure()
indicates that the memory failure was handled by the pgmap->owner /
dax_dev holder stack, then the typical memory failure path is
short-circuited. Otherwise, for non-reflink filesystems where
page->mapping() is valid the legacy / existing memory_failure()
operates as it does currently. If reflink capable filesystems want to
share a common implementation to map pfns to files they can, but I
don't think that common code belongs in mm/memory-failure.c.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/dax.c | 21 ++++++++
> include/linux/dax.h | 1 +
> include/linux/mm.h | 9 ++++
> mm/memory-failure.c | 114 ++++++++++++++++++++++++++++----------------
> 4 files changed, 105 insertions(+), 40 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 62352cbcf0f4..58faca85455a 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -389,6 +389,27 @@ static struct page *dax_busy_page(void *entry)
> return NULL;
> }
>
> +/*
> + * dax_load_pfn - Load pfn of the DAX entry corresponding to a page
> + * @mapping: The file whose entry we want to load
> + * @index: The offset where the DAX entry located in
> + *
> + * Return: pfn of the DAX entry
> + */
> +unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index)
> +{
> + XA_STATE(xas, &mapping->i_pages, index);
> + void *entry;
> + unsigned long pfn;
> +
> + xas_lock_irq(&xas);
> + entry = xas_load(&xas);
> + pfn = dax_to_pfn(entry);
> + xas_unlock_irq(&xas);
This looks racy, what happened to the locking afforded by dax_lock_page()?
> +
> + return pfn;
> +}
> +
> /*
> * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
> * @page: The page whose entry we want to lock
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 1ce343a960ab..6e758daa5004 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -158,6 +158,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
>
> struct page *dax_layout_busy_page(struct address_space *mapping);
> struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
> +unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index);
> dax_entry_t dax_lock_page(struct page *page);
> void dax_unlock_page(struct page *page, dax_entry_t cookie);
> #else
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c274f75efcf9..2b7527e93c77 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1187,6 +1187,14 @@ static inline bool is_device_private_page(const struct page *page)
> page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> }
>
> +static inline bool is_device_fsdax_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + IS_ENABLED(CONFIG_FS_DAX) &&
> + is_zone_device_page(page) &&
> + page->pgmap->type == MEMORY_DEVICE_FS_DAX;
Why is this necessary? The dax_dev holder is the one that knows the
nature of the pfn. The common memory_failure() code should not care
about fsdax vs devdax.
> +}
> +
> static inline bool is_pci_p2pdma_page(const struct page *page)
> {
> return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> @@ -3078,6 +3086,7 @@ enum mf_flags {
> MF_MUST_KILL = 1 << 2,
> MF_SOFT_OFFLINE = 1 << 3,
> };
> +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags);
> extern int memory_failure(unsigned long pfn, int flags);
> extern void memory_failure_queue(unsigned long pfn, int flags);
> extern void memory_failure_queue_kick(int cpu);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 85ad98c00fd9..4377e727d478 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -56,6 +56,7 @@
> #include <linux/kfifo.h>
> #include <linux/ratelimit.h>
> #include <linux/page-isolation.h>
> +#include <linux/dax.h>
> #include "internal.h"
> #include "ras/ras_event.h"
>
> @@ -120,6 +121,13 @@ static int hwpoison_filter_dev(struct page *p)
> if (PageSlab(p))
> return -EINVAL;
>
> + if (pfn_valid(page_to_pfn(p))) {
> + if (is_device_fsdax_page(p))
This is racy unless the page is pinned. Also, not clear why this is needed?
> + return 0;
> + else
> + return -EINVAL;
> + }
> +
> mapping = page_mapping(p);
> if (mapping == NULL || mapping->host == NULL)
> return -EINVAL;
> @@ -290,10 +298,9 @@ void shake_page(struct page *p, int access)
> }
> EXPORT_SYMBOL_GPL(shake_page);
>
> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> - struct vm_area_struct *vma)
> +static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> + unsigned long address)
> {
> - unsigned long address = vma_address(page, vma);
> pgd_t *pgd;
> p4d_t *p4d;
> pud_t *pud;
> @@ -333,9 +340,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
> * Schedule a process for later kill.
> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> */
> -static void add_to_kill(struct task_struct *tsk, struct page *p,
> - struct vm_area_struct *vma,
> - struct list_head *to_kill)
> +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
> + struct vm_area_struct *vma, struct list_head *to_kill)
> {
> struct to_kill *tk;
>
> @@ -346,9 +352,12 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
> }
>
> tk->addr = page_address_in_vma(p, vma);
> - if (is_zone_device_page(p))
> - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> - else
> + if (is_zone_device_page(p)) {
> + if (is_device_fsdax_page(p))
> + tk->addr = vma->vm_start +
> + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
What was wrong with the original code?
> + } else
> tk->size_shift = page_shift(compound_head(p));
>
> /*
> @@ -496,7 +505,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
> if (!page_mapped_in_vma(page, vma))
> continue;
> if (vma->vm_mm == t->mm)
> - add_to_kill(t, page, vma, to_kill);
> + add_to_kill(t, page, 0, vma, to_kill);
> }
> }
> read_unlock(&tasklist_lock);
> @@ -506,24 +515,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
> /*
> * Collect processes when the error hit a file mapped page.
> */
> -static void collect_procs_file(struct page *page, struct list_head *to_kill,
> - int force_early)
> +static void collect_procs_file(struct page *page, struct address_space *mapping,
> + pgoff_t pgoff, struct list_head *to_kill, int force_early)
> {
collect_procs() and kill_procs() are the only core memory_failure()
helpers I expect would be exported for a filesystem dax_dev holder to
call when it is trying to cleanup a memory_failure() on a reflink'd
mapping.
> struct vm_area_struct *vma;
> struct task_struct *tsk;
> - struct address_space *mapping = page->mapping;
> - pgoff_t pgoff;
>
> i_mmap_lock_read(mapping);
> read_lock(&tasklist_lock);
> - pgoff = page_to_pgoff(page);
> for_each_process(tsk) {
> struct task_struct *t = task_early_kill(tsk, force_early);
> -
> if (!t)
> continue;
> - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
> - pgoff) {
> + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> /*
> * Send early kill signal to tasks where a vma covers
> * the page but the corrupted page is not necessarily
> @@ -532,7 +536,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
> * to be informed of all such data corruptions.
> */
> if (vma->vm_mm == t->mm)
> - add_to_kill(t, page, vma, to_kill);
> + add_to_kill(t, page, pgoff, vma, to_kill);
> }
> }
> read_unlock(&tasklist_lock);
> @@ -551,7 +555,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
> if (PageAnon(page))
> collect_procs_anon(page, tokill, force_early);
> else
> - collect_procs_file(page, tokill, force_early);
> + collect_procs_file(page, page_mapping(page), page_to_pgoff(page),
> + tokill, force_early);
> }
>
> static const char *action_name[] = {
> @@ -1218,6 +1223,51 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
> return 0;
> }
>
> +static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
> + struct address_space *mapping, pgoff_t index, int flags)
> +{
> + struct to_kill *tk;
> + unsigned long size = 0;
> + loff_t start;
> +
> + list_for_each_entry(tk, to_kill, nd)
> + if (tk->size_shift)
> + size = max(size, 1UL << tk->size_shift);
> + if (size) {
> + /*
> + * Unmap the largest mapping to avoid breaking up
> + * device-dax mappings which are constant size. The
> + * actual size of the mapping being torn down is
> + * communicated in siginfo, see kill_proc()
> + */
> + start = (index << PAGE_SHIFT) & ~(size - 1);
> + unmap_mapping_range(mapping, start, size, 0);
> + }
> +
> + kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
> +}
> +
> +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags)
> +{
> + LIST_HEAD(to_kill);
> + /* load the pfn of the dax mapping file */
> + unsigned long pfn = dax_load_pfn(mapping, index);
> +
> + /*
> + * Unlike System-RAM there is no possibility to swap in a
> + * different physical page at a given virtual address, so all
> + * userspace consumption of ZONE_DEVICE memory necessitates
> + * SIGBUS (i.e. MF_MUST_KILL)
> + */
> + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> + collect_procs_file(pfn_to_page(pfn), mapping, index, &to_kill,
> + flags & MF_ACTION_REQUIRED);
> +
> + unmap_and_kill(&to_kill, pfn, mapping, index, flags);
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
> +
> static int memory_failure_hugetlb(unsigned long pfn, int flags)
> {
> struct page *p = pfn_to_page(pfn);
> @@ -1298,12 +1348,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> struct dev_pagemap *pgmap)
> {
> struct page *page = pfn_to_page(pfn);
> - const bool unmap_success = true;
> - unsigned long size = 0;
> - struct to_kill *tk;
> - LIST_HEAD(tokill);
> + LIST_HEAD(to_kill);
> int rc = -EBUSY;
> - loff_t start;
> dax_entry_t cookie;
>
> if (flags & MF_COUNT_INCREASED)
> @@ -1355,22 +1401,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> * SIGBUS (i.e. MF_MUST_KILL)
> */
> flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> - collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
> + collect_procs_file(page, page->mapping, page->index, &to_kill,
> + flags & MF_ACTION_REQUIRED);
>
> - list_for_each_entry(tk, &tokill, nd)
> - if (tk->size_shift)
> - size = max(size, 1UL << tk->size_shift);
> - if (size) {
> - /*
> - * Unmap the largest mapping to avoid breaking up
> - * device-dax mappings which are constant size. The
> - * actual size of the mapping being torn down is
> - * communicated in siginfo, see kill_proc()
> - */
> - start = (page->index << PAGE_SHIFT) & ~(size - 1);
> - unmap_mapping_range(page->mapping, start, size, 0);
> - }
> - kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
> + unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
There's just too much change in this patch and not enough
justification of what is being refactored and why.
> -----Original Message-----
> From: Dan Williams <[email protected]>
> Subject: Re: [PATCH v4 04/10] mm, fsdax: Refactor memory-failure handler for
> dax mapping
>
> [ drop old nvdimm list, add the new one ]
>
> On Thu, Jun 3, 2021 at 6:19 PM Shiyang Ruan <[email protected]> wrote:
> >
> > The current memory_failure_dev_pagemap() can only handle single-mapped
> > dax page for fsdax mode. The dax page could be mapped by multiple
> > files and offsets if we let reflink feature & fsdax mode work
> > together. So, we refactor current implementation to support handle
> > memory failure on each file and offset.
>
> I don't understand this organization, perhaps because this patch introduces
> mf_dax_kill_procs() without a user.
Yes, I think I made it a mess... I should reorganize this whole patchset.
mf_dax_kill_procs() is used by xfs in patch 9. I meant to refactor this
code for use by the following patches.
> However, my expectation is that
> memory_failure() is mostly untouched save for an early check via
> pgmap->notify_memory_failure(). If pgmap->notify_memory_failure() indicates
> that the memory failure was handled by the pgmap->owner / dax_dev holder
> stack, then the typical memory failure path is short-circuited. Otherwise, for
> non-reflink filesystems where
> page->mapping() is valid the legacy / existing memory_failure()
> operates as it does currently.
You can find this logic in patch 5.
When it comes to memory_failure() and memory_failure_dev_pagemap(),
after some checks, it will try to call pgmap->ops->memory_failure(). If
this interface is implemented, for example when the pgmap belongs to a
pmem device, it will call down through the dax_dev holder stack and
finally reach mf_dax_kill_procs().
However, if something goes wrong in this stack, such as the feature not
being supported or the interface not being implemented, it will fall
back to the normal memory-failure handler, which is refactored as
mf_generic_kill_procs().
So, I think we are in agreement on this.
Let me reorganize this code.
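
In other words, the intended dispatch is roughly the following sketch
(the exact error handling is an assumption; mf_generic_kill_procs() is
the fallback named above):

	/* In memory_failure_dev_pagemap(), roughly: */
	if (pgmap->ops && pgmap->ops->memory_failure) {
		rc = pgmap->ops->memory_failure(pgmap, pfn, flags);
		if (rc != -EOPNOTSUPP)
			goto out;	/* handled by the holder stack */
	}
	rc = mf_generic_kill_procs(pfn, flags);	/* legacy path */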
> If reflink capable filesystems want to share a
> common implementation to map pfns to files they can, but I don't think that
> common code belongs in mm/memory-failure.c.
>
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/dax.c | 21 ++++++++
> > include/linux/dax.h | 1 +
> > include/linux/mm.h | 9 ++++
> > mm/memory-failure.c | 114
> > ++++++++++++++++++++++++++++----------------
> > 4 files changed, 105 insertions(+), 40 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 62352cbcf0f4..58faca85455a 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -389,6 +389,27 @@ static struct page *dax_busy_page(void *entry)
> > return NULL;
> > }
> >
> > +/*
> > + * dax_load_pfn - Load pfn of the DAX entry corresponding to a page
> > + * @mapping: The file whose entry we want to load
> > + * @index: The offset where the DAX entry located in
> > + *
> > + * Return: pfn of the DAX entry
> > + */
> > +unsigned long dax_load_pfn(struct address_space *mapping, unsigned
> > +long index) {
> > + XA_STATE(xas, &mapping->i_pages, index);
> > + void *entry;
> > + unsigned long pfn;
> > +
> > + xas_lock_irq(&xas);
> > + entry = xas_load(&xas);
> > + pfn = dax_to_pfn(entry);
> > + xas_unlock_irq(&xas);
>
> This looks racy, what happened to the locking afforded by dax_lock_page()?
>
> > +
> > + return pfn;
> > +}
> > +
> > /*
> > * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
> > * @page: The page whose entry we want to lock
> >
> > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > index 1ce343a960ab..6e758daa5004 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -158,6 +158,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
> >
> > struct page *dax_layout_busy_page(struct address_space *mapping);
> > struct page *dax_layout_busy_page_range(struct address_space *mapping,
> > loff_t start, loff_t end);
> > +unsigned long dax_load_pfn(struct address_space *mapping, unsigned long index);
> > dax_entry_t dax_lock_page(struct page *page);
> > void dax_unlock_page(struct page *page, dax_entry_t cookie);
> > #else
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index c274f75efcf9..2b7527e93c77 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1187,6 +1187,14 @@ static inline bool is_device_private_page(const struct page *page)
> >                 page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> > }
> >
> > +static inline bool is_device_fsdax_page(const struct page *page)
> > +{
> > + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> > + IS_ENABLED(CONFIG_FS_DAX) &&
> > + is_zone_device_page(page) &&
> > + page->pgmap->type == MEMORY_DEVICE_FS_DAX;
>
> Why is this necessary? The dax_dev holder is the one that knows the nature of
> the pfn. The common memory_failure() code should not care about fsdax vs
> devdax.
add_to_kill() in collect_procs() needs this. Please see the explanation below.
>
> > +}
> > +
> > static inline bool is_pci_p2pdma_page(const struct page *page)
> > {
> >         return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> > @@ -3078,6 +3086,7 @@ enum mf_flags {
> >         MF_MUST_KILL = 1 << 2,
> >         MF_SOFT_OFFLINE = 1 << 3,
> > };
> > +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags);
> > extern int memory_failure(unsigned long pfn, int flags);
> > extern void memory_failure_queue(unsigned long pfn, int flags);
> > extern void memory_failure_queue_kick(int cpu);
> >
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 85ad98c00fd9..4377e727d478 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -56,6 +56,7 @@
> > #include <linux/kfifo.h>
> > #include <linux/ratelimit.h>
> > #include <linux/page-isolation.h>
> > +#include <linux/dax.h>
> > #include "internal.h"
> > #include "ras/ras_event.h"
> >
> > @@ -120,6 +121,13 @@ static int hwpoison_filter_dev(struct page *p)
> > if (PageSlab(p))
> > return -EINVAL;
> >
> > + if (pfn_valid(page_to_pfn(p))) {
> > + if (is_device_fsdax_page(p))
>
> This is racy unless the page is pinned. Also, not clear why this is needed?
>
> > + return 0;
> > + else
> > + return -EINVAL;
> > + }
> > +
> > mapping = page_mapping(p);
> > if (mapping == NULL || mapping->host == NULL)
> > return -EINVAL;
> > @@ -290,10 +298,9 @@ void shake_page(struct page *p, int access)
> > }
> > EXPORT_SYMBOL_GPL(shake_page);
> >
> > -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> > - struct vm_area_struct *vma)
> > +static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> > +               unsigned long address)
> > {
> > - unsigned long address = vma_address(page, vma);
> > pgd_t *pgd;
> > p4d_t *p4d;
> > pud_t *pud;
> > @@ -333,9 +340,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
> > * Schedule a process for later kill.
> > * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> > */
> > -static void add_to_kill(struct task_struct *tsk, struct page *p,
> > - struct vm_area_struct *vma,
> > - struct list_head *to_kill)
> > +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
> > +               struct vm_area_struct *vma, struct list_head *to_kill)
> > {
> > struct to_kill *tk;
> >
> > @@ -346,9 +352,12 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
> > }
> >
> > tk->addr = page_address_in_vma(p, vma);
> > - if (is_zone_device_page(p))
> > - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> > - else
> > + if (is_zone_device_page(p)) {
> > + if (is_device_fsdax_page(p))
> > +                       tk->addr = vma->vm_start +
> > +                               ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> > +               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
>
> What was wrong with the original code?
Here is the explanation: for a normal page, the kernel associates the file's mapping and offset with page->mapping and page->index for rmap tracking. For fsdax, in order to support reflink, we no longer use this mechanism and rely on the dax_device holder stack instead, so a fsdax page's page->mapping is NULL. As a result, we need is_device_fsdax_page(p) to distinguish fsdax pages, and we have to calculate tk->addr manually.
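To make the calculation concrete, take some hypothetical numbers: with 4K pages, vma->vm_start = 0x7f1000000000, vma->vm_pgoff = 0x10 and a corrupted entry at pgoff = 0x12:

	tk->addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT)
	         = 0x7f1000000000 + ((0x12 - 0x10) << 12)
	         = 0x7f1000002000

page_address_in_vma() cannot compute this for us because it depends on page->mapping and page->index, which are not usable here.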
>
> > + } else
> > tk->size_shift = page_shift(compound_head(p));
> >
> > /*
> > @@ -496,7 +505,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
> > if (!page_mapped_in_vma(page, vma))
> > continue;
> > if (vma->vm_mm == t->mm)
> > - add_to_kill(t, page, vma, to_kill);
> > + add_to_kill(t, page, 0, vma, to_kill);
> > }
> > }
> > read_unlock(&tasklist_lock);
> > @@ -506,24 +515,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
> > /*
> > * Collect processes when the error hit a file mapped page.
> > */
> > -static void collect_procs_file(struct page *page, struct list_head *to_kill,
> > - int force_early)
> > +static void collect_procs_file(struct page *page, struct address_space *mapping,
> > +               pgoff_t pgoff, struct list_head *to_kill, int force_early)
> > {
>
> collect_procs() and kill_procs() are the only core memory_failure() helpers I
> expect would be exported for a fileystem dax_dev holder to call when it is trying
> to cleanup a memory_failure() on a reflink'd mapping.
Yes, they are the core we need. But there are some small differences between handling a normal page and a dax page, so I factored these two core functions into two helpers: mf_generic_kill_procs() for normal pages and mf_dax_kill_procs() for dax pages.
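For instance, the generic helper is roughly shaped like this (a sketch only; the real code also handles dax entry locking, hwpoison flags and error paths):

	static int mf_generic_kill_procs(unsigned long pfn, int flags)
	{
		struct page *page = pfn_to_page(pfn);
		LIST_HEAD(to_kill);

		/* non-reflink case: page->mapping / page->index are valid */
		collect_procs_file(page, page->mapping, page->index,
				&to_kill, flags & MF_ACTION_REQUIRED);
		unmap_and_kill(&to_kill, pfn, page->mapping, page->index,
				flags);
		return 0;
	}

while mf_dax_kill_procs() (in the diff below) takes the mapping and index from the filesystem's rmap lookup instead, since page->mapping is NULL for reflink-capable fsdax.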
>
> > struct vm_area_struct *vma;
> > struct task_struct *tsk;
> > - struct address_space *mapping = page->mapping;
> > - pgoff_t pgoff;
> >
> > i_mmap_lock_read(mapping);
> > read_lock(&tasklist_lock);
> > - pgoff = page_to_pgoff(page);
> > for_each_process(tsk) {
> >                 struct task_struct *t = task_early_kill(tsk, force_early);
> > -
> > if (!t)
> > continue;
> > -               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
> > -                               pgoff) {
> > + vma_interval_tree_foreach(vma, &mapping->i_mmap,
> > + pgoff, pgoff) {
> > /*
> >                          * Send early kill signal to tasks where a vma covers
> >                          * the page but the corrupted page is not necessarily
> > @@ -532,7 +536,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
> > * to be informed of all such data corruptions.
> > */
> > if (vma->vm_mm == t->mm)
> > - add_to_kill(t, page, vma, to_kill);
> > +                               add_to_kill(t, page, pgoff, vma, to_kill);
> > }
> > }
> > read_unlock(&tasklist_lock);
> > @@ -551,7 +555,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
> > if (PageAnon(page))
> > collect_procs_anon(page, tokill, force_early);
> > else
> > - collect_procs_file(page, tokill, force_early);
> > +               collect_procs_file(page, page_mapping(page), page_to_pgoff(page),
> > + tokill, force_early);
> > }
> >
> > static const char *action_name[] = {
> > @@ -1218,6 +1223,51 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
> > return 0;
> > }
> >
> > +static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
> > +               struct address_space *mapping, pgoff_t index, int flags)
> > +{
> > + struct to_kill *tk;
> > + unsigned long size = 0;
> > + loff_t start;
> > +
> > + list_for_each_entry(tk, to_kill, nd)
> > + if (tk->size_shift)
> > + size = max(size, 1UL << tk->size_shift);
> > + if (size) {
> > + /*
> > + * Unmap the largest mapping to avoid breaking up
> > + * device-dax mappings which are constant size. The
> > + * actual size of the mapping being torn down is
> > + * communicated in siginfo, see kill_proc()
> > + */
> > + start = (index << PAGE_SHIFT) & ~(size - 1);
> > + unmap_mapping_range(mapping, start, size, 0);
> > + }
> > +
> > + kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
> > +}
> > +
> > +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, int flags)
> > +{
> > + LIST_HEAD(to_kill);
> > + /* load the pfn of the dax mapping file */
> > + unsigned long pfn = dax_load_pfn(mapping, index);
> > +
> > + /*
> > + * Unlike System-RAM there is no possibility to swap in a
> > + * different physical page at a given virtual address, so all
> > + * userspace consumption of ZONE_DEVICE memory necessitates
> > + * SIGBUS (i.e. MF_MUST_KILL)
> > + */
> > + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> > + collect_procs_file(pfn_to_page(pfn), mapping, index, &to_kill,
> > + flags & MF_ACTION_REQUIRED);
> > +
> > + unmap_and_kill(&to_kill, pfn, mapping, index, flags);
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
> > +
> > static int memory_failure_hugetlb(unsigned long pfn, int flags)
> > {
> >         struct page *p = pfn_to_page(pfn);
> > @@ -1298,12 +1348,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> >                 struct dev_pagemap *pgmap)
> > {
> > struct page *page = pfn_to_page(pfn);
> > - const bool unmap_success = true;
> > - unsigned long size = 0;
> > - struct to_kill *tk;
> > - LIST_HEAD(tokill);
> > + LIST_HEAD(to_kill);
> > int rc = -EBUSY;
> > - loff_t start;
> > dax_entry_t cookie;
> >
> >         if (flags & MF_COUNT_INCREASED)
> > @@ -1355,22 +1401,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> > * SIGBUS (i.e. MF_MUST_KILL)
> > */
> > flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> > - collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
> > + collect_procs_file(page, page->mapping, page->index, &to_kill,
> > + flags & MF_ACTION_REQUIRED);
> >
> > - list_for_each_entry(tk, &tokill, nd)
> > - if (tk->size_shift)
> > - size = max(size, 1UL << tk->size_shift);
> > - if (size) {
> > - /*
> > - * Unmap the largest mapping to avoid breaking up
> > - * device-dax mappings which are constant size. The
> > - * actual size of the mapping being torn down is
> > - * communicated in siginfo, see kill_proc()
> > - */
> > - start = (page->index << PAGE_SHIFT) & ~(size - 1);
> > - unmap_mapping_range(page->mapping, start, size, 0);
> > - }
> > -       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
> > +       unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
>
> There's just too much change in this patch and not enough justification of what
> is being refactored and why.
Sorry again for the mess...
--
Thanks,
Ruan.