by Dan Williams

[permalink] [raw]

在 2022/2/16 10:09, Dan Williams 写道:
> On Thu, Jan 27, 2022 at 4:41 AM Shiyang Ruan <[email protected]> wrote:
>>
>> Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file
>> mappings. In this case, the dax-RMAP already takes the responsibility
>> to look up shared files by a given dax page. The page->mapping is no
>> longer used for rmap but for marking that this dax page is shared.
>> And to make sure disassociation works fine, we use page->index as
>> refcount, and clear page->mapping to the initial state when page->index
>> is decreased to 0.
>>
>> With the help of this new flag, we are able to distinguish the normal case
>> from the CoW case, and keep the warning in the normal case.
>>
>> Signed-off-by: Shiyang Ruan <[email protected]>
>> ---
>> fs/dax.c | 65 ++++++++++++++++++++++++++++++++------
>> include/linux/page-flags.h | 6 ++++
>> 2 files changed, 62 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/dax.c b/fs/dax.c
>> index 250794a5b789..88879c579c1f 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -334,13 +334,46 @@ static unsigned long dax_end_pfn(void *entry)
>> for (pfn = dax_to_pfn(entry); \
>> pfn < dax_end_pfn(entry); pfn++)
>>
>> +static inline void dax_mapping_set_cow_flag(struct address_space *mapping)
>> +{
>> + mapping = (struct address_space *)PAGE_MAPPING_DAX_COW;
>> +}
>> +
>> +static inline bool dax_mapping_is_cow(struct address_space *mapping)
>> +{
>> + return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
>> +}
>> +
>> /*
>> - * TODO: for reflink+dax we need a way to associate a single page with
>> - * multiple address_space instances at different linear_page_index()
>> - * offsets.
>> + * Set or Update the page->mapping with FS_DAX_MAPPING_COW flag.
>> + * Return true if it is an Update.
>> + */
>> +static inline bool dax_mapping_set_cow(struct page *page)
>> +{
>> + if (page->mapping) {
>> + /* flag already set */
>> + if (dax_mapping_is_cow(page->mapping))
>> + return false;
>> +
>> + /*
>> + * This page has been mapped even before it is shared, just
>> + * need to set this FS_DAX_MAPPING_COW flag.
>> + */
>> + dax_mapping_set_cow_flag(page->mapping);
>> + return true;
>> + }
>> + /* Newly associate CoW mapping */
>> + dax_mapping_set_cow_flag(page->mapping);
>> + return false;
>> +}
>> +
>> +/*
>> + * When it is called in dax_insert_entry(), the cow flag will indicate that
>> + * whether this entry is shared by multiple files. If so, set the page->mapping
>> + * to be FS_DAX_MAPPING_COW, and use page->index as refcount.
>> */
>> static void dax_associate_entry(void *entry, struct address_space *mapping,
>> - struct vm_area_struct *vma, unsigned long address)
>> + struct vm_area_struct *vma, unsigned long address, bool cow)
>> {
>> unsigned long size = dax_entry_size(entry), pfn, index;
>> int i = 0;
>> @@ -352,9 +385,17 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
>> for_each_mapped_pfn(entry, pfn) {
>> struct page *page = pfn_to_page(pfn);
>>
>> - WARN_ON_ONCE(page->mapping);
>> - page->mapping = mapping;
>> - page->index = index + i++;
>> + if (cow) {
>> + if (dax_mapping_set_cow(page)) {
>> + /* Was normal, now updated to CoW */
>> + page->index = 2;
>> + } else
>> + page->index++;
>> + } else {
>> + WARN_ON_ONCE(page->mapping);
>> + page->mapping = mapping;
>> + page->index = index + i++;
>> + }
>> }
>> }
>>
>> @@ -370,7 +411,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
>> struct page *page = pfn_to_page(pfn);
>>
>> WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
>> - WARN_ON_ONCE(page->mapping && page->mapping != mapping);
>> + if (!dax_mapping_is_cow(page->mapping)) {
>> + /* keep the CoW flag if this page is still shared */
>> + if (page->index-- > 0)
>> + continue;
>> + } else
>> + WARN_ON_ONCE(page->mapping && page->mapping != mapping);
>> page->mapping = NULL;
>> page->index = 0;
>> }
>> @@ -810,7 +856,8 @@ static void *dax_insert_entry(struct xa_state *xas,
>> void *old;
>>
>> dax_disassociate_entry(entry, mapping, false);
>> - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
>> + dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
>> + false);
>
> Where is the caller that passes 'true'? Also when that caller arrives
> introduce a separate dax_associate_cow_entry() as that's easier to
> read than dax_associate_entry(..., true) in case someone does not
> remember what that boolean flag means.

This flag is intended to be used once CoW support is introduced. When the
operation is a CoW one, as determined by the iomap and srcmap flags, this
flag will be set to true.

I think I should describe this in detail in the commit message.

>
> However, it's not clear to me that this approach is a good idea given
> that the filesystem is the source of truth for how many address_spaces
> this page mapping might be duplicated. What about an iomap_page_ops for
> fsdax to ask the filesystem when it is ok to clear the mapping
> association for a page?

I'll think about how to implement it this way.

--
Thanks,
Ruan.

>
>> /*
>> * Only swap our new entry into the page cache if the current
>> * entry is a zero page or an empty entry. If a normal PTE or
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index 1c3b6e5c8bfd..6370d279795a 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -572,6 +572,12 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
>> #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
>> #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
>>
>> +/*
>> + * Different with flags above, this flag is used only for fsdax mode. It
>> + * indicates that this page->mapping is now under reflink case.
>> + */
>> +#define PAGE_MAPPING_DAX_COW 0x1
>> +
>> static __always_inline int PageMappingFlags(struct page *page)
>> {
>> return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
>> --
>> 2.34.1
>>
>>
>>

2022-02-16 07:39:09

by Dan Williams

[permalink] [raw]

Subject: Re: [PATCH v10 8/9] xfs: Implement ->notify_failure() for XFS

On Thu, Jan 27, 2022 at 4:41 AM Shiyang Ruan <[email protected]> wrote:
>
> Introduce xfs_notify_failure.c to handle failure-related work, such as
> implementing ->notify_failure(), registering/unregistering the dax holder
> in xfs, and so on.
>
> If the rmap feature of XFS is enabled, we can query it to find files and
> metadata which are associated with the corrupt data. For now all we do
> is kill processes with that file mapped into their address spaces, but
> future patches could actually do something about corrupt metadata.
>
> After that, the memory failure needs to notify the processes who are
> using those files.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/xfs/Makefile | 1 +
> fs/xfs/xfs_buf.c | 12 ++
> fs/xfs/xfs_fsops.c | 3 +
> fs/xfs/xfs_mount.h | 1 +
> fs/xfs/xfs_notify_failure.c | 222 ++++++++++++++++++++++++++++++++++++
> fs/xfs/xfs_notify_failure.h | 10 ++
> 6 files changed, 249 insertions(+)
> create mode 100644 fs/xfs/xfs_notify_failure.c
> create mode 100644 fs/xfs/xfs_notify_failure.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 04611a1068b4..389970b3e13b 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
> xfs_message.o \
> xfs_mount.o \
> xfs_mru_cache.o \
> + xfs_notify_failure.o \
> xfs_pwork.o \
> xfs_reflink.o \
> xfs_stats.o \
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index b45e0d50a405..017010b3d601 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -19,6 +19,7 @@
> #include "xfs_errortag.h"
> #include "xfs_error.h"
> #include "xfs_ag.h"
> +#include "xfs_notify_failure.h"
>
> static struct kmem_cache *xfs_buf_cache;
>
> @@ -1892,6 +1893,8 @@ xfs_free_buftarg(
> list_lru_destroy(&btp->bt_lru);
>
> blkdev_issue_flush(btp->bt_bdev);
> + if (btp->bt_daxdev)
> + dax_unregister_holder(btp->bt_daxdev);
> fs_put_dax(btp->bt_daxdev);
>
> kmem_free(btp);
> @@ -1946,6 +1949,15 @@ xfs_alloc_buftarg(
> btp->bt_dev = bdev->bd_dev;
> btp->bt_bdev = bdev;
> btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
> + if (btp->bt_daxdev) {
> + if (dax_get_holder(btp->bt_daxdev)) {
> + xfs_err(mp, "DAX device already in use?!");

Per the earlier feedback this can be checked atomically inside of
dax_register_holder() with cmpxchg().

> + goto error_free;
> + }
> +
> + dax_register_holder(btp->bt_daxdev, mp,
> + &xfs_dax_holder_operations);
> + }
>
> /*
> * Buffer IO error rate limiting. Limit it to no more than 10 messages
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 33e26690a8c4..d4d36c5bef11 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -542,6 +542,9 @@ xfs_do_force_shutdown(
> } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
> tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> why = "Corruption of in-memory data";
> + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
> + tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> + why = "Corruption of on-disk metadata";
> } else {
> tag = XFS_PTAG_SHUTDOWN_IOERROR;
> why = "Metadata I/O Error";
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 00720a02e761..47ff4ac53c4c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -435,6 +435,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
> #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
> #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
> #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
> +#define SHUTDOWN_CORRUPT_ONDISK 0x0010 /* corrupt metadata on device */
>
> #define XFS_SHUTDOWN_STRINGS \
> { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> new file mode 100644
> index 000000000000..6abaa043f4bc
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -0,0 +1,222 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu. All Rights Reserved.
> + */
> +
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bit.h"
> +#include "xfs_btree.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_rtalloc.h"
> +#include "xfs_trans.h"
> +
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +
> +struct failure_info {
> + xfs_agblock_t startblock;
> + xfs_extlen_t blockcount;
> + int mf_flags;
> +};
> +
> +#if IS_ENABLED(CONFIG_MEMORY_FAILURE) && IS_ENABLED(CONFIG_FS_DAX)
> +static pgoff_t
> +xfs_failure_pgoff(
> + struct xfs_mount *mp,
> + const struct xfs_rmap_irec *rec,
> + const struct failure_info *notify)
> +{
> + uint64_t pos = rec->rm_offset;
> +
> + if (notify->startblock > rec->rm_startblock)
> + pos += XFS_FSB_TO_B(mp,
> + notify->startblock - rec->rm_startblock);
> + return pos >> PAGE_SHIFT;
> +}
> +
> +static unsigned long
> +xfs_failure_pgcnt(
> + struct xfs_mount *mp,
> + const struct xfs_rmap_irec *rec,
> + const struct failure_info *notify)
> +{
> + xfs_agblock_t end_rec;
> + xfs_agblock_t end_notify;
> + xfs_agblock_t start_cross;
> + xfs_agblock_t end_cross;
> +
> + start_cross = max(rec->rm_startblock, notify->startblock);
> +
> + end_rec = rec->rm_startblock + rec->rm_blockcount;
> + end_notify = notify->startblock + notify->blockcount;
> + end_cross = min(end_rec, end_notify);
> +
> + return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
> +}
> +
> +static int
> +xfs_dax_failure_fn(
> + struct xfs_btree_cur *cur,
> + const struct xfs_rmap_irec *rec,
> + void *data)
> +{
> + struct xfs_mount *mp = cur->bc_mp;
> + struct xfs_inode *ip;
> + struct failure_info *notify = data;
> + int error = 0;
> +
> + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> + (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> + return -EFSCORRUPTED;
> + }
> +
> + /* Get files that incore, filter out others that are not in use. */
> + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> + 0, &ip);
> + /* Continue the rmap query if the inode isn't incore */
> + if (error == -ENODATA)
> + return 0;
> + if (error)
> + return error;
> +
> + error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> + xfs_failure_pgoff(mp, rec, notify),
> + xfs_failure_pgcnt(mp, rec, notify),
> + notify->mf_flags);
> + xfs_irele(ip);
> + return error;
> +}
> +#else
> +static int
> +xfs_dax_failure_fn(
> + struct xfs_btree_cur *cur,
> + const struct xfs_rmap_irec *rec,
> + void *data)
> +{
> + struct xfs_mount *mp = cur->bc_mp;
> +
> + /* No other option besides shutting down the fs. */
> + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> + return -EFSCORRUPTED;
> +}
> +#endif /* CONFIG_MEMORY_FAILURE && CONFIG_FS_DAX */
> +
> +static int
> +xfs_dax_notify_ddev_failure(
> + struct xfs_mount *mp,
> + xfs_daddr_t daddr,
> + xfs_daddr_t bblen,
> + int mf_flags)
> +{
> + struct xfs_trans *tp = NULL;
> + struct xfs_btree_cur *cur = NULL;
> + struct xfs_buf *agf_bp = NULL;
> + struct failure_info notify;
> + int error = 0;
> + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
> + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
> + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
> + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
> +
> + /*
> + * Once a file is found by rmap, we take the intersection of two ranges:
> + * notification range and file extent range, to make sure we won't go
> + * out of scope.
> + */
> + notify.mf_flags = mf_flags;
> + notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> + notify.blockcount = XFS_BB_TO_FSB(mp, bblen);
> +
> + error = xfs_trans_alloc_empty(mp, &tp);
> + if (error)
> + return error;
> +
> + for (; agno <= end_agno; agno++) {
> + struct xfs_rmap_irec ri_low = { };
> + struct xfs_rmap_irec ri_high;
> +
> + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> + if (error)
> + break;
> +
> + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);
> +
> + /*
> + * Set the rmap range from ri_low to ri_high, which represents
> + * a [start, end] where we looking for the files or metadata.
> + * The part of range out of a AG will be ignored. So, it's fine
> + * to set ri_low to "startblock" in all loops. When it reaches
> + * the last AG, set the ri_high to "endblock" to make sure we
> + * actually end at the end.
> + */
> + memset(&ri_high, 0xFF, sizeof(ri_high));
> + ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> + if (agno == end_agno)
> + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
> +
> + error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> + xfs_dax_failure_fn, &notify);
> + xfs_btree_del_cursor(cur, error);
> + xfs_trans_brelse(tp, agf_bp);
> + if (error)
> + break;
> +
> + fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
> + }
> +
> + xfs_trans_cancel(tp);
> + return error;
> +}
> +
> +static int
> +xfs_dax_notify_failure(
> + struct dax_device *dax_dev,
> + u64 offset,
> + u64 len,
> + int mf_flags)
> +{
> + struct xfs_mount *mp = dax_get_holder(dax_dev);
> +
> + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
> + xfs_warn(mp,
> + "notify_failure() not supported on realtime device!");
> + return -EOPNOTSUPP;
> + }
> +
> + if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> + mp->m_logdev_targp != mp->m_ddev_targp) {
> + xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> + return -EFSCORRUPTED;
> + }
> +
> + if (!xfs_has_rmapbt(mp)) {
> + xfs_warn(mp, "notify_failure() needs rmapbt enabled!");

Doesn't this need to be resolved at mount time?

> + return -EOPNOTSUPP;
> + }
> +
> + if (offset < mp->m_ddev_targp->bt_dax_part_off ||
> + ((offset + len) > mp->m_ddev_targp->bt_bdev->bd_nr_sectors <<
> + SECTOR_SHIFT)) {

With the removal of partition support bt_dax_part_off can never be
non-zero and the offset / len validation should be done against the
boundaries of the dax device in terms of physical page offset and
nr_pages.

> + xfs_warn(mp, "notify_failure() goes out of the scope.");
> + return -ENXIO;
> + }
> +
> + offset -= mp->m_ddev_targp->bt_dax_part_off;
> + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> + mf_flags);

Same here, all offset adjustment code can be dropped because failure
notification should be disabled at mount time if the mount point is
not associated with a whole disk device.

> +}
> +
> +const struct dax_holder_operations xfs_dax_holder_operations = {
> + .notify_failure = xfs_dax_notify_failure,
> +};
> diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
> new file mode 100644
> index 000000000000..f40cb315e7ce
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu. All Rights Reserved.
> + */
> +#ifndef __XFS_NOTIFY_FAILURE_H__
> +#define __XFS_NOTIFY_FAILURE_H__
> +
> +extern const struct dax_holder_operations xfs_dax_holder_operations;
> +
> +#endif /* __XFS_NOTIFY_FAILURE_H__ */
> --
> 2.34.1
>
>
>