2020-09-15 10:14:50

by Ruan Shiyang

[permalink] [raw]
Subject: [RFC PATCH 0/4] fsdax: introduce fs query to support reflink

This patchset is a try to resolve the problem of tracking shared page
for fsdax.

This patchset moves owner tracking from dax_assocaite_entry() to pmem
device, by introducing an interface ->memory_failure() of struct
pagemap. The interface is called by memory_failure() in mm, and
implemented by pmem device. Then pmem device find the filesystem
which the damaged page located in, and call ->storage_lost() to track
files or metadata assocaited with this page. Finally we are able to
try to fix the damaged data in filesystem and do other necessary
processing, such as killing processes who are using the files
affected.

The call trace is like this:
memory_failure()
pgmap->ops->memory_failure() => pmem_pgmap_memory_failure()
bdev->bd_super->storage_lost() => xfs_fs_storage_lost()
xfs_rmap_query_range()
xfs_storage_lost_helper()
mf_recover_controller->recover_fn => \
memory_failure_dev_pagemap_kill_procs()

The collect_procs() and kill_procs() are moved into a callback which
is passed from memory_failure() to xfs_storage_lost_helper(). So we
can call it when a file assocaited is found, instead of creating a
file list and iterate it.

The fsdax & reflink support for XFS is not contained in this patchset.

(Rebased on v5.9-rc2)
==

Shiyang Ruan (4):
fs: introduce ->storage_lost() for memory-failure
pagemap: introduce ->memory_failure()
mm, fsdax: refactor dax handler in memory-failure
fsdax: remove useless (dis)associate functions

block/genhd.c | 12 ++++
drivers/nvdimm/pmem.c | 31 ++++++++++
fs/dax.c | 64 ++------------------
fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++
include/linux/dax.h | 5 +-
include/linux/fs.h | 1 +
include/linux/genhd.h | 2 +
include/linux/memremap.h | 3 +
include/linux/mm.h | 14 +++++
mm/memory-failure.c | 127 ++++++++++++++++++++++++---------------
10 files changed, 229 insertions(+), 110 deletions(-)

--
2.28.0




2020-09-15 10:15:25

by Ruan Shiyang

[permalink] [raw]
Subject: [RFC PATCH 4/4] fsdax: remove useless (dis)associate functions

Since owner tarcking is triggerred by pmem device, these functions are
useless. So remove it.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 46 ----------------------------------------------
1 file changed, 46 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 1ec592f0aadd..4c85b07abcc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -323,48 +323,6 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)

-/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address)
-{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
-{
- unsigned long pfn;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
-}
-
static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
@@ -516,7 +474,6 @@ static void *grab_mapping_entry(struct xa_state *xas,
xas_lock_irq(xas);
}

- dax_disassociate_entry(entry, mapping, false);
xas_store(xas, NULL); /* undo the PMD join */
dax_wake_entry(xas, entry, true);
mapping->nrexceptional--;
@@ -636,7 +593,6 @@ static int __dax_invalidate_entry(struct address_space *mapping,
(xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
- dax_disassociate_entry(entry, mapping, trunc);
xas_store(&xas, NULL);
mapping->nrexceptional--;
ret = 1;
@@ -730,8 +686,6 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;

- dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
--
2.28.0



2020-09-15 10:15:34

by Ruan Shiyang

[permalink] [raw]
Subject: [RFC PATCH 3/4] mm, fsdax: refactor dax handler in memory-failure

With the ->memory_failure() implemented in pmem device and
->storage_lost() in XFS, we are able to track files or metadata
and process them further.

We don't track files by page->mapping, page->index any more, so
some of functions who obtain ->mapping, ->index from struct page
parameter need to be changed by directly passing mapping and index.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 18 +++----
include/linux/dax.h | 5 +-
include/linux/mm.h | 8 +++
mm/memory-failure.c | 127 +++++++++++++++++++++++++++-----------------
4 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 95341af1a966..1ec592f0aadd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,14 @@ static struct page *dax_busy_page(void *entry)
}

/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
* Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
* not be locked.
*/
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, NULL, 0);
void *entry;
@@ -394,8 +394,6 @@ dax_entry_t dax_lock_page(struct page *page)
/* Ensure page->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
-
entry = NULL;
if (!mapping || !dax_mapping(mapping))
break;
@@ -413,11 +411,7 @@ dax_entry_t dax_lock_page(struct page *page)

xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
- if (mapping != page->mapping) {
- xas_unlock_irq(&xas);
- continue;
- }
- xas_set(&xas, page->index);
+ xas_set(&xas, index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
@@ -433,10 +427,10 @@ dax_entry_t dax_lock_page(struct page *page)
return (dax_entry_t)entry;
}

-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie)
{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
+ XA_STATE(xas, &mapping->i_pages, index);

if (S_ISCHR(mapping->host->i_mode))
return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..669ba768b89e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -141,8 +141,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc);

struct page *dax_layout_busy_page(struct address_space *mapping);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index);
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie);
#else
static inline bool bdev_dax_supported(struct block_device *bdev,
int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f0c36e1bf3d..d170b3f74d83 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1126,6 +1126,14 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

+static inline bool is_device_fsdax_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..0c4a25bf276f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -93,6 +93,9 @@ static int hwpoison_filter_dev(struct page *p)
if (PageSlab(p))
return -EINVAL;

+ if (is_device_fsdax_page(p))
+ return 0;
+
mapping = page_mapping(p);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -263,9 +266,8 @@ void shake_page(struct page *p, int access)
EXPORT_SYMBOL_GPL(shake_page);

static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, unsigned long address)
{
- unsigned long address = vma_address(page, vma);
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -306,8 +308,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ struct address_space *mapping, pgoff_t pgoff,
+ struct vm_area_struct *vma, struct list_head *to_kill)
{
struct to_kill *tk;

@@ -317,12 +319,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
return;
}

- tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
- tk->size_shift = page_shift(compound_head(p));
-
+ if (is_device_fsdax_page(p)) {
+ tk->addr = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+ } else {
+ tk->addr = page_address_in_vma(p, vma);
+ if (is_zone_device_page(p)) {
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma,
+ vma_address(p, vma));
+ } else
+ tk->size_shift = page_shift(compound_head(p));
+ }
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
* "tk->size_shift" is always non-zero for !is_zone_device_page(),
@@ -468,7 +476,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, NULL, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -478,23 +486,19 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
- int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+ pgoff_t pgoff, struct list_head *to_kill, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct address_space *mapping = page->mapping;

i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
for_each_process(tsk) {
- pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
-
if (!t)
continue;
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
- pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
@@ -502,8 +506,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ if (vma->vm_mm == t->mm) {
+ add_to_kill(t, page, mapping, pgoff, vma,
+ to_kill);
+ }
}
}
read_unlock(&tasklist_lock);
@@ -522,7 +528,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, force_early);
+ collect_procs_file(page, page->mapping, page_to_pgoff(page),
+ tokill, force_early);
}

static const char *action_name[] = {
@@ -1176,14 +1183,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
return res;
}

-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
- struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap_kill_procs(unsigned long pfn, int flags,
+ struct address_space *mapping, pgoff_t index)
{
struct page *page = pfn_to_page(pfn);
const bool unmap_success = true;
unsigned long size = 0;
struct to_kill *tk;
- LIST_HEAD(tokill);
+ LIST_HEAD(to_kill);
int rc = -EBUSY;
loff_t start;
dax_entry_t cookie;
@@ -1195,28 +1202,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- cookie = dax_lock_page(page);
+ cookie = dax_lock(mapping, index);
if (!cookie)
- goto out;
-
- if (hwpoison_filter(page)) {
- rc = 0;
goto unlock;
- }
-
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- /*
- * TODO: Handle HMM pages which may need coordination
- * with device-side memory.
- */
- goto unlock;
- }
-
- /*
- * Use this flag as an indication that the dax page has been
- * remapped UC to prevent speculative consumption of poison.
- */
- SetPageHWPoison(page);

/*
* Unlike System-RAM there is no possibility to swap in a
@@ -1225,9 +1213,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs_file(page, mapping, index, &to_kill,
+ flags & MF_ACTION_REQUIRED);

- list_for_each_entry(tk, &tokill, nd)
+ list_for_each_entry(tk, &to_kill, nd)
if (tk->size_shift)
size = max(size, 1UL << tk->size_shift);
if (size) {
@@ -1237,13 +1226,51 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* actual size of the mapping being torn down is
* communicated in siginfo, see kill_proc()
*/
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ start = (index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(mapping, start, start + size, 0);
}
- kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+
+ kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_page(page, cookie);
+ dax_unlock(mapping, index, cookie);
+ return rc;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ struct mf_recover_controller mfrc = {
+ .recover_fn = memory_failure_dev_pagemap_kill_procs,
+ .pfn = pfn,
+ .flags = flags,
+ };
+ int rc;
+
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ /*
+ * TODO: Handle HMM pages which may need coordination
+ * with device-side memory.
+ */
+ goto out;
+ }
+
+ if (hwpoison_filter(page)) {
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /* call driver to handle the memory failure */
+ if (pgmap->ops->memory_failure)
+ rc = pgmap->ops->memory_failure(pgmap, &mfrc);
+
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
--
2.28.0



2020-09-15 10:16:28

by Ruan Shiyang

[permalink] [raw]
Subject: [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure

This function is used to handle errors which may cause data lost in
filesystem. Such as memory-failure in fsdax mode.

In XFS, it requires "rmapbt" feature in order to query for files or
metadata which associated to the error block. Then we could call fs
recover functions to try to repair the damaged data.(did not implemented
in this patch)

After that, the memory-failure also needs to kill processes who are
using those files. The struct mf_recover_controller is created to store
necessary parameters.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 1 +
include/linux/mm.h | 6 ++++
3 files changed, 87 insertions(+)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 71ac6c1cdc36..118d9c5d9e1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,10 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_bit.h"

#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}

+static int
+xfs_storage_lost_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_inode *ip;
+ struct mf_recover_controller *mfrc = priv;
+ int rc = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
+ // TODO check and try to fix metadata
+ } else {
+ /*
+ * Get files that incore, filter out others that are not in use.
+ */
+ xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ if (!ip)
+ return 0;
+ if (!VFS_I(ip)->i_mapping)
+ goto out;
+
+ rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
+ VFS_I(ip)->i_mapping, rec->rm_offset);
+
+ // TODO try to fix data
+out:
+ xfs_irele(ip);
+ }
+
+ return rc;
+}
+
+static int
+xfs_fs_storage_lost(
+ struct super_block *sb,
+ loff_t offset,
+ void *priv)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_rmap_irec rmap_low, rmap_high;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_fsblock_t fsbno = XFS_B_TO_FSB(mp, offset);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+ int error = 0;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
+
+ /* Construct a range for rmap query */
+ memset(&rmap_low, 0, sizeof(rmap_low));
+ memset(&rmap_high, 0xFF, sizeof(rmap_high));
+ rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
+
+ error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
+ xfs_storage_lost_helper, priv);
+ if (error == -ECANCELED)
+ error = 0;
+
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+ return error;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
.show_options = xfs_fs_show_options,
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
+ .storage_lost = xfs_fs_storage_lost,
};

static int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e019ea2f1347..bd90485cfdc9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1951,6 +1951,7 @@ struct super_operations {
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
+ int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
};

/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1983e08f5906..3f0c36e1bf3d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);

+struct mf_recover_controller {
+ int (*recover_fn)(unsigned long pfn, int flags,
+ struct address_space *mapping, pgoff_t index);
+ unsigned long pfn;
+ int flags;
+};

/*
* Error handlers for various types of pages.
--
2.28.0



2020-09-15 22:32:23

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure

On Tue, Sep 15, 2020 at 06:13:08PM +0800, Shiyang Ruan wrote:
> This function is used to handle errors which may cause data lost in
> filesystem. Such as memory-failure in fsdax mode.
>
> In XFS, it requires "rmapbt" feature in order to query for files or
> metadata which associated to the error block. Then we could call fs
> recover functions to try to repair the damaged data.(did not implemented
> in this patch)
>
> After that, the memory-failure also needs to kill processes who are
> using those files. The struct mf_recover_controller is created to store
> necessary parameters.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/fs.h | 1 +
> include/linux/mm.h | 6 ++++
> 3 files changed, 87 insertions(+)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 71ac6c1cdc36..118d9c5d9e1e 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -35,6 +35,10 @@
> #include "xfs_refcount_item.h"
> #include "xfs_bmap_item.h"
> #include "xfs_reflink.h"
> +#include "xfs_alloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_bit.h"
>
> #include <linux/magic.h>
> #include <linux/fs_context.h>
> @@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
> return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
> }
>
> +static int
> +xfs_storage_lost_helper(
> + struct xfs_btree_cur *cur,
> + struct xfs_rmap_irec *rec,
> + void *priv)
> +{
> + struct xfs_inode *ip;
> + struct mf_recover_controller *mfrc = priv;
> + int rc = 0;
> +
> + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
> + // TODO check and try to fix metadata
> + } else {
> + /*
> + * Get files that incore, filter out others that are not in use.
> + */
> + xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> + 0, &ip);

Missing return value check here.

> + if (!ip)
> + return 0;
> + if (!VFS_I(ip)->i_mapping)
> + goto out;
> +
> + rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
> + VFS_I(ip)->i_mapping, rec->rm_offset);
> +
> + // TODO try to fix data
> +out:
> + xfs_irele(ip);
> + }
> +
> + return rc;
> +}
> +
> +static int
> +xfs_fs_storage_lost(
> + struct super_block *sb,
> + loff_t offset,

offset into which device? XFS supports three...

I'm also a little surprised you don't pass in a length.

I guess that means this function will get called repeatedly for every
byte in the poisoned range?

> + void *priv)
> +{
> + struct xfs_mount *mp = XFS_M(sb);
> + struct xfs_trans *tp = NULL;
> + struct xfs_btree_cur *cur = NULL;
> + struct xfs_rmap_irec rmap_low, rmap_high;
> + struct xfs_buf *agf_bp = NULL;
> + xfs_fsblock_t fsbno = XFS_B_TO_FSB(mp, offset);
> + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
> + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
> + int error = 0;
> +
> + error = xfs_trans_alloc_empty(mp, &tp);
> + if (error)
> + return error;
> +
> + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> + if (error)
> + return error;
> +
> + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);

...and this is definitely the wrong call sequence if the malfunctioning
device is the realtime device. If a dax rt device dies, you'll be
shooting down files on the data device, which will cause all sorts of
problems.

Question: Should all this poison recovery stuff go into a new file?
xfs_poison.c? There's already a lot of code in xfs_super.c.

--D

> +
> + /* Construct a range for rmap query */
> + memset(&rmap_low, 0, sizeof(rmap_low));
> + memset(&rmap_high, 0xFF, sizeof(rmap_high));
> + rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
> +
> + error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
> + xfs_storage_lost_helper, priv);
> + if (error == -ECANCELED)
> + error = 0;
> +
> + xfs_btree_del_cursor(cur, error);
> + xfs_trans_brelse(tp, agf_bp);
> + return error;
> +}
> +
> static const struct super_operations xfs_super_operations = {
> .alloc_inode = xfs_fs_alloc_inode,
> .destroy_inode = xfs_fs_destroy_inode,
> @@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
> .show_options = xfs_fs_show_options,
> .nr_cached_objects = xfs_fs_nr_cached_objects,
> .free_cached_objects = xfs_fs_free_cached_objects,
> + .storage_lost = xfs_fs_storage_lost,
> };
>
> static int
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e019ea2f1347..bd90485cfdc9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1951,6 +1951,7 @@ struct super_operations {
> struct shrink_control *);
> long (*free_cached_objects)(struct super_block *,
> struct shrink_control *);
> + int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
> };
>
> /*
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1983e08f5906..3f0c36e1bf3d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
> extern atomic_long_t num_poisoned_pages __read_mostly;
> extern int soft_offline_page(unsigned long pfn, int flags);
>
> +struct mf_recover_controller {
> + int (*recover_fn)(unsigned long pfn, int flags,
> + struct address_space *mapping, pgoff_t index);
> + unsigned long pfn;
> + int flags;
> +};
>
> /*
> * Error handlers for various types of pages.
> --
> 2.28.0
>
>
>

2020-09-16 02:18:12

by Ruan Shiyang

[permalink] [raw]
Subject: Re: [RFC PATCH 1/4] fs: introduce ->storage_lost() for memory-failure



On 2020/9/16 上午12:16, Darrick J. Wong wrote:
> On Tue, Sep 15, 2020 at 06:13:08PM +0800, Shiyang Ruan wrote:
>> This function is used to handle errors which may cause data lost in
>> filesystem. Such as memory-failure in fsdax mode.
>>
>> In XFS, it requires "rmapbt" feature in order to query for files or
>> metadata which associated to the error block. Then we could call fs
>> recover functions to try to repair the damaged data.(did not implemented
>> in this patch)
>>
>> After that, the memory-failure also needs to kill processes who are
>> using those files. The struct mf_recover_controller is created to store
>> necessary parameters.
>>
>> Signed-off-by: Shiyang Ruan <[email protected]>
>> ---
>> fs/xfs/xfs_super.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++
>> include/linux/fs.h | 1 +
>> include/linux/mm.h | 6 ++++
>> 3 files changed, 87 insertions(+)
>>
>> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
>> index 71ac6c1cdc36..118d9c5d9e1e 100644
>> --- a/fs/xfs/xfs_super.c
>> +++ b/fs/xfs/xfs_super.c
>> @@ -35,6 +35,10 @@
>> #include "xfs_refcount_item.h"
>> #include "xfs_bmap_item.h"
>> #include "xfs_reflink.h"
>> +#include "xfs_alloc.h"
>> +#include "xfs_rmap.h"
>> +#include "xfs_rmap_btree.h"
>> +#include "xfs_bit.h"
>>
>> #include <linux/magic.h>
>> #include <linux/fs_context.h>
>> @@ -1104,6 +1108,81 @@ xfs_fs_free_cached_objects(
>> return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>> }
>>
>> +static int
>> +xfs_storage_lost_helper(
>> + struct xfs_btree_cur *cur,
>> + struct xfs_rmap_irec *rec,
>> + void *priv)
>> +{
>> + struct xfs_inode *ip;
>> + struct mf_recover_controller *mfrc = priv;
>> + int rc = 0;
>> +
>> + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) {
>> + // TODO check and try to fix metadata
>> + } else {
>> + /*
>> + * Get files that incore, filter out others that are not in use.
>> + */
>> + xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
>> + 0, &ip);
>
> Missing return value check here.

Yes, I ignored it. My fault.

>
>> + if (!ip)
>> + return 0;
>> + if (!VFS_I(ip)->i_mapping)
>> + goto out;
>> +
>> + rc = mfrc->recover_fn(mfrc->pfn, mfrc->flags,
>> + VFS_I(ip)->i_mapping, rec->rm_offset);
>> +
>> + // TODO try to fix data
>> +out:
>> + xfs_irele(ip);
>> + }
>> +
>> + return rc;
>> +}
>> +
>> +static int
>> +xfs_fs_storage_lost(
>> + struct super_block *sb,
>> + loff_t offset,
>
> offset into which device? XFS supports three...
>
> I'm also a little surprised you don't pass in a length.
>
> I guess that means this function will get called repeatedly for every
> byte in the poisoned range?

The memory-failure will triggered on one PFN each time, so its length
could be one PAGE_SIZE. But I think you are right, it's better to tell
filesystem how much range is effected and need to be fixed. I didn't
know but I think there may be some other cases besides memory-failure.
So the length is necessary.

>
>> + void *priv)
>> +{
>> + struct xfs_mount *mp = XFS_M(sb);
>> + struct xfs_trans *tp = NULL;
>> + struct xfs_btree_cur *cur = NULL;
>> + struct xfs_rmap_irec rmap_low, rmap_high;
>> + struct xfs_buf *agf_bp = NULL;
>> + xfs_fsblock_t fsbno = XFS_B_TO_FSB(mp, offset);
>> + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
>> + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
>> + int error = 0;
>> +
>> + error = xfs_trans_alloc_empty(mp, &tp);
>> + if (error)
>> + return error;
>> +
>> + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
>> + if (error)
>> + return error;
>> +
>> + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
>
> ...and this is definitely the wrong call sequence if the malfunctioning
> device is the realtime device. If a dax rt device dies, you'll be
> shooting down files on the data device, which will cause all sorts of
> problems.

I didn't notice that. Let me think about it.
>
> Question: Should all this poison recovery stuff go into a new file?
> xfs_poison.c? There's already a lot of code in xfs_super.c.

Yes, it's a bit too much. I'll move them into a new file.


--
Thanks,
Ruan Shiyang.
>
> --D
>
>> +
>> + /* Construct a range for rmap query */
>> + memset(&rmap_low, 0, sizeof(rmap_low));
>> + memset(&rmap_high, 0xFF, sizeof(rmap_high));
>> + rmap_low.rm_startblock = rmap_high.rm_startblock = agbno;
>> +
>> + error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
>> + xfs_storage_lost_helper, priv);
>> + if (error == -ECANCELED)
>> + error = 0;
>> +
>> + xfs_btree_del_cursor(cur, error);
>> + xfs_trans_brelse(tp, agf_bp);
>> + return error;
>> +}
>> +
>> static const struct super_operations xfs_super_operations = {
>> .alloc_inode = xfs_fs_alloc_inode,
>> .destroy_inode = xfs_fs_destroy_inode,
>> @@ -1117,6 +1196,7 @@ static const struct super_operations xfs_super_operations = {
>> .show_options = xfs_fs_show_options,
>> .nr_cached_objects = xfs_fs_nr_cached_objects,
>> .free_cached_objects = xfs_fs_free_cached_objects,
>> + .storage_lost = xfs_fs_storage_lost,
>> };
>>
>> static int
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index e019ea2f1347..bd90485cfdc9 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1951,6 +1951,7 @@ struct super_operations {
>> struct shrink_control *);
>> long (*free_cached_objects)(struct super_block *,
>> struct shrink_control *);
>> + int (*storage_lost)(struct super_block *sb, loff_t offset, void *priv);
>> };
>>
>> /*
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 1983e08f5906..3f0c36e1bf3d 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3002,6 +3002,12 @@ extern void shake_page(struct page *p, int access);
>> extern atomic_long_t num_poisoned_pages __read_mostly;
>> extern int soft_offline_page(unsigned long pfn, int flags);
>>
>> +struct mf_recover_controller {
>> + int (*recover_fn)(unsigned long pfn, int flags,
>> + struct address_space *mapping, pgoff_t index);
>> + unsigned long pfn;
>> + int flags;
>> +};
>>
>> /*
>> * Error handlers for various types of pages.
>> --
>> 2.28.0
>>
>>
>>
>
>