2022-05-09 08:48:40

by Shiyang Ruan

Subject: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

This is a combination of two patchsets:
1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/

Changes since v13 of fsdax-rmap:
1. Fixed mistakes made while rebasing the code onto the latest linux-next
2. Rebased to next-20220504

Changes since v10 of fsdax-reflink:
1. Rebased to next-20220504 and fsdax-rmap
2. Dropped a needless cleanup patch: 'fsdax: Convert dax_iomap_zero to
iter model'
3. Fixed many conflicts during rebasing
4. Fixed a dedupe bug in Patch 05: the actual length to compare could be
shorter than smap->length or dmap->length (a sketch of the idea follows below)
PS: There are many changes during rebasing. I think it's better to
review it again.
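
A minimal sketch of the idea behind fix 4, for illustration only; the
helper and argument names below are assumptions, not the actual patch code
(kernel types/macros assumed via <linux/types.h> and <linux/minmax.h>):

/*
 * When deduping across two mappings, the byte count compared in one
 * pass is bounded by the remaining request *and* by both extents,
 * either of which may be shorter than the other.
 */
static inline u64 dedupe_cmp_len(u64 remaining, u64 smap_len, u64 dmap_len)
{
	return min3(remaining, smap_len, dmap_len);
}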

==
Shiyang Ruan (14):
fsdax-rmap:
dax: Introduce holder for dax_device
mm: factor helpers for memory_failure_dev_pagemap
pagemap,pmem: Introduce ->memory_failure()
fsdax: Introduce dax_lock_mapping_entry()
mm: Introduce mf_dax_kill_procs() for fsdax case
xfs: Implement ->notify_failure() for XFS
fsdax: set a CoW flag when associate reflink mappings
fsdax-reflink:
fsdax: Output address in dax_iomap_pfn() and rename it
fsdax: Introduce dax_iomap_cow_copy()
fsdax: Replace mmap entry in case of CoW
fsdax: Add dax_iomap_cow_copy() for dax zero
fsdax: Dedup file range to use a compare function
xfs: support CoW in fsdax mode
xfs: Add dax dedupe support

drivers/dax/super.c | 67 +++++-
drivers/md/dm.c | 2 +-
drivers/nvdimm/pmem.c | 17 ++
fs/dax.c | 398 ++++++++++++++++++++++++++++++------
fs/erofs/super.c | 13 +-
fs/ext2/super.c | 7 +-
fs/ext4/super.c | 9 +-
fs/remap_range.c | 31 ++-
fs/xfs/Makefile | 5 +
fs/xfs/xfs_buf.c | 10 +-
fs/xfs/xfs_file.c | 9 +-
fs/xfs/xfs_fsops.c | 3 +
fs/xfs/xfs_inode.c | 69 ++++++-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_iomap.c | 46 ++++-
fs/xfs/xfs_iomap.h | 3 +
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_notify_failure.c | 220 ++++++++++++++++++++
fs/xfs/xfs_reflink.c | 12 +-
fs/xfs/xfs_super.h | 1 +
include/linux/dax.h | 56 ++++-
include/linux/fs.h | 12 +-
include/linux/memremap.h | 12 ++
include/linux/mm.h | 2 +
include/linux/page-flags.h | 6 +
mm/memory-failure.c | 257 ++++++++++++++++-------
26 files changed, 1087 insertions(+), 182 deletions(-)
create mode 100644 fs/xfs/xfs_notify_failure.c

--
2.35.1





2022-05-09 09:42:15

by Shiyang Ruan

Subject: [PATCH v14 05/07] mm: Introduce mf_dax_kill_procs() for fsdax case

This new function is a variant of mf_generic_kill_procs that accepts a
(file, offset) pair instead of a struct page, to support multiple files
sharing a DAX mapping. It is intended to be called by file systems as
part of the memory_failure handler, after the file system has performed
a reverse mapping from the storage address to the file and file offset.
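
For illustration, a minimal sketch of how a filesystem's memory-failure
path might call the new helper once it has reverse-mapped the failed
range to a file. The function name and the way the range is derived are
assumptions, not the actual XFS code from this series:

/* Hypothetical caller; assumes <linux/mm.h> and <linux/pagemap.h>. */
static int example_dax_kill_file_range(struct address_space *mapping,
				       loff_t pos, u64 len, int mf_flags)
{
	/* Simplified: assumes pos/len are page-aligned. */
	pgoff_t index = pos >> PAGE_SHIFT;
	unsigned long count = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * mf_dax_kill_procs() walks [index, index + count), marks each
	 * DAX page HWPoison and kills the processes mapping that range.
	 */
	return mf_dax_kill_procs(mapping, index, count, mf_flags);
}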

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Reviewed-by: Miaohe Lin <[email protected]>
---
include/linux/mm.h | 2 +
mm/memory-failure.c | 96 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 88 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index de32c0383387..e2c0f69f0660 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3227,6 +3227,8 @@ enum mf_flags {
MF_SOFT_OFFLINE = 1 << 3,
MF_UNPOISON = 1 << 4,
};
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index aeb19593af9c..aedfc5097420 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -295,10 +295,9 @@ void shake_page(struct page *p)
}
EXPORT_SYMBOL_GPL(shake_page);

-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
@@ -338,10 +337,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ * In other cases, such as anonymous and file-backed pages, the address to be
+ * killed can be calculated from @p itself.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+ struct list_head *to_kill)
{
struct to_kill *tk;

@@ -352,9 +355,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}

tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ if (is_zone_device_page(p)) {
+ /*
+ * Since page->mapping is not used for fsdax, we need to
+ * calculate the address based on the vma.
+ */
+ if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));

/*
@@ -503,7 +512,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -539,13 +548,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}

+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, true);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, pgoff, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
@@ -1580,6 +1617,45 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
return rc;
}

+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping: address_space of the file in use
+ * @index: start pgoff of the range within the file
+ * @count: length of the range, in unit of PAGE_SIZE
+ * @mf_flags: memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags)
+{
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ struct page *page;
+ size_t end = index + count;
+
+ mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+ for (; index < end; index++) {
+ page = NULL;
+ cookie = dax_lock_mapping_entry(mapping, index, &page);
+ if (!cookie)
+ return -EBUSY;
+ if (!page)
+ goto unlock;
+
+ SetPageHWPoison(page);
+
+ collect_procs_fsdax(page, mapping, index, &to_kill);
+ unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+ index, mf_flags);
+unlock:
+ dax_unlock_mapping_entry(mapping, index, cookie);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
--
2.35.1




2022-05-09 11:33:35

by Shiyang Ruan

Subject: [PATCH v11 06/07] xfs: support CoW in fsdax mode

In fsdax mode, WRITE and ZERO on a shared extent need CoW to be
performed. After that, the newly allocated extents need to be remapped
to the file. So, add CoW identification in ->iomap_begin(), and
implement ->iomap_end() to do the remapping work.

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/xfs/xfs_file.c | 7 ++-----
fs/xfs/xfs_iomap.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 3 +++
3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index af954a5b71f8..5a4508b23b51 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -669,7 +669,7 @@ xfs_file_dax_write(
pos = iocb->ki_pos;

trace_xfs_file_dax_write(iocb, from);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -1285,10 +1285,7 @@ __xfs_filemap_fault(
pfn_t pfn;

xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5a393259a3a3..e35842215d22 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -27,6 +27,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "linux/dax.h"

#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -773,7 +774,8 @@ xfs_direct_write_iomap_begin(

/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -867,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};

+static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1358,3 +1387,18 @@ xfs_truncate_page(
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
+
+#ifdef CONFIG_FS_DAX
+int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#endif
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e88dc162c785..89dfa3bb099f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,6 +25,8 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+int xfs_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+ bool write_fault, pfn_t *pfn);

static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -51,5 +53,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;

#endif /* __XFS_IOMAP_H__*/
--
2.35.1




2022-05-10 10:06:06

by Christoph Hellwig

Subject: Re: [PATCH v11 06/07] xfs: support CoW in fsdax mode

> +#ifdef CONFIG_FS_DAX
> +int
> +xfs_dax_fault(
> + struct vm_fault *vmf,
> + enum page_entry_size pe_size,
> + bool write_fault,
> + pfn_t *pfn)
> +{
> + return dax_iomap_fault(vmf, pe_size, pfn, NULL,
> + (write_fault && !vmf->cow_page) ?
> + &xfs_dax_write_iomap_ops :
> + &xfs_read_iomap_ops);
> +}
> +#endif

Is there any reason this is in xfs_iomap.c and not xfs_file.c?

Otherwise the patch looks good:


Reviewed-by: Christoph Hellwig <[email protected]>

2022-05-10 11:34:32

by Christoph Hellwig

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

The patch numbering looks odd due to the combination of the two
series. But otherwise this looks good to me, modulo the one minor
nitpick.

2022-05-10 13:46:25

by Shiyang Ruan

Subject: Re: [PATCH v11 06/07] xfs: support CoW in fsdax mode



On 2022/5/10 13:45, Christoph Hellwig wrote:
>> +#ifdef CONFIG_FS_DAX
>> +int
>> +xfs_dax_fault(
>> + struct vm_fault *vmf,
>> + enum page_entry_size pe_size,
>> + bool write_fault,
>> + pfn_t *pfn)
>> +{
>> + return dax_iomap_fault(vmf, pe_size, pfn, NULL,
>> + (write_fault && !vmf->cow_page) ?
>> + &xfs_dax_write_iomap_ops :
>> + &xfs_read_iomap_ops);
>> +}
>> +#endif
>
> Is there any reason this is in xfs_iomap.c and not xfs_file.c?

Yes, it's better to put it in xfs_file.c since it's the only caller. I
didn't notice that...


--
Thanks,
Ruan.

>
> Otherwise the patch looks good:
>
>
> Reviewed-by: Christoph Hellwig <[email protected]>



2022-05-11 02:55:55

by Dan Williams

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

[ add Andrew ]


On Tue, May 10, 2022 at 6:49 PM Dave Chinner <[email protected]> wrote:
>
> On Tue, May 10, 2022 at 05:03:52PM -0700, Darrick J. Wong wrote:
> > On Sun, May 08, 2022 at 10:36:06PM +0800, Shiyang Ruan wrote:
> > > This is a combination of two patchsets:
> > > 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
> > > 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/
> > >
> > > Changes since v13 of fsdax-rmap:
> > > 1. Fixed mistakes during rebasing code to latest next-
> > > 2. Rebased to next-20220504
> > >
> > > Changes since v10 of fsdax-reflink:
> > > 1. Rebased to next-20220504 and fsdax-rmap
> > > 2. Dropped a needless cleanup patch: 'fsdax: Convert dax_iomap_zero to
> > > iter model'
> > > 3. Fixed many conflicts during rebasing
> > > 4. Fixed a dedupe bug in Patch 05: the actuall length to compare could be
> > > shorter than smap->length or dmap->length.
> > > PS: There are many changes during rebasing. I think it's better to
> > > review again.
> > >
> > > ==
> > > Shiyang Ruan (14):
> > > fsdax-rmap:
> > > dax: Introduce holder for dax_device
> > > mm: factor helpers for memory_failure_dev_pagemap
> > > pagemap,pmem: Introduce ->memory_failure()
> > > fsdax: Introduce dax_lock_mapping_entry()
> > > mm: Introduce mf_dax_kill_procs() for fsdax case
> >
> > Hmm. This patchset touches at least the dax, pagecache, and xfs
> > subsystems. Assuming it's too late for 5.19, how should we stage this
> > for 5.20?
>
> Yeah, it's past my "last date for this merge cycle" which was
> -rc6. I expected stuff might slip a little - as it has with the LARP
> code - but I don't have the time and bandwidth to start working
> on merging another feature from scratch before the merge window
> comes around.
>
> Getting the dax+reflink stuff in this cycle was always an optimistic
> stretch, but I wanted to try so that there was no doubt it would be
> ready for merge in the next cycle...
>
> > I could just add the entire series to iomap-5.20-merge and base the
> > xfs-5.20-merge off of that? But I'm not sure what else might be landing
> > in the other subsystems, so I'm open to input.
>
> It'll need to be a stable branch somewhere, but I don't think it
> really matters where al long as it's merged into the xfs for-next
> tree so it gets filesystem test coverage...

So how about letting the notify_failure() bits go through -mm this
cycle, if Andrew will have it, and then the reflink work has a clean
v5.19-rc1 baseline to build from?

2022-05-11 05:53:55

by Andrew Morton

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong" <[email protected]> wrote:

> On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
> >
> > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > really matters where al long as it's merged into the xfs for-next
> > > > tree so it gets filesystem test coverage...
> > >
> > > So how about let the notify_failure() bits go through -mm this cycle,
> > > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > > baseline to build from?
> >
> > What are we referring to here? I think a minimal thing would be the
> > memremap.h and memory-failure.c changes from
> > https://lkml.kernel.org/r/[email protected] ?
> >
> > Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> > would probably be straining things to slip it into 5.19.
> >
> > The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> > right thing, but it's a networking errno. I suppose livable with if it
> > never escapes the kernel, but if it can get back to userspace then a
> > user would be justified in wondering how the heck a filesystem
> > operation generated a networking errno?
>
> <shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
> they don't know how to do something...

Can it propagate back to userspace?

2022-05-11 07:03:08

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 06:55:50PM -0700, Dan Williams wrote:
> [ add Andrew ]
>
>
> On Tue, May 10, 2022 at 6:49 PM Dave Chinner <[email protected]> wrote:
> >
> > On Tue, May 10, 2022 at 05:03:52PM -0700, Darrick J. Wong wrote:
> > > On Sun, May 08, 2022 at 10:36:06PM +0800, Shiyang Ruan wrote:
> > > > This is a combination of two patchsets:
> > > > 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
> > > > 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/
> > > >
> > > > Changes since v13 of fsdax-rmap:
> > > > 1. Fixed mistakes during rebasing code to latest next-
> > > > 2. Rebased to next-20220504
> > > >
> > > > Changes since v10 of fsdax-reflink:
> > > > 1. Rebased to next-20220504 and fsdax-rmap
> > > > 2. Dropped a needless cleanup patch: 'fsdax: Convert dax_iomap_zero to
> > > > iter model'
> > > > 3. Fixed many conflicts during rebasing
> > > > 4. Fixed a dedupe bug in Patch 05: the actuall length to compare could be
> > > > shorter than smap->length or dmap->length.
> > > > PS: There are many changes during rebasing. I think it's better to
> > > > review again.
> > > >
> > > > ==
> > > > Shiyang Ruan (14):
> > > > fsdax-rmap:
> > > > dax: Introduce holder for dax_device
> > > > mm: factor helpers for memory_failure_dev_pagemap
> > > > pagemap,pmem: Introduce ->memory_failure()
> > > > fsdax: Introduce dax_lock_mapping_entry()
> > > > mm: Introduce mf_dax_kill_procs() for fsdax case
> > >
> > > Hmm. This patchset touches at least the dax, pagecache, and xfs
> > > subsystems. Assuming it's too late for 5.19, how should we stage this
> > > for 5.20?
> >
> > Yeah, it's past my "last date for this merge cycle" which was
> > -rc6. I expected stuff might slip a little - as it has with the LARP
> > code - but I don't have the time and bandwidth to start working
> > on merging another feature from scratch before the merge window
> > comes around.
> >
> > Getting the dax+reflink stuff in this cycle was always an optimistic
> > stretch, but I wanted to try so that there was no doubt it would be
> > ready for merge in the next cycle...
> >
> > > I could just add the entire series to iomap-5.20-merge and base the
> > > xfs-5.20-merge off of that? But I'm not sure what else might be landing
> > > in the other subsystems, so I'm open to input.
> >
> > It'll need to be a stable branch somewhere, but I don't think it
> > really matters where al long as it's merged into the xfs for-next
> > tree so it gets filesystem test coverage...
>
> So how about let the notify_failure() bits go through -mm this cycle,
> if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> baseline to build from?

Sure, if you want to push them that way I'm not going to complain
or stop you. :)

Anything that makes the eventual XFS feature merge simpler counts as
a win in my books.

Cheers,

Dave.
--
Dave Chinner
[email protected]

2022-05-11 07:23:53

by Andrew Morton

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:

> > It'll need to be a stable branch somewhere, but I don't think it
> > really matters where al long as it's merged into the xfs for-next
> > tree so it gets filesystem test coverage...
>
> So how about let the notify_failure() bits go through -mm this cycle,
> if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> baseline to build from?

What are we referring to here? I think a minimal thing would be the
memremap.h and memory-failure.c changes from
https://lkml.kernel.org/r/[email protected] ?

Sure, I can scoot that into 5.19-rc1 if you think that's best. It
would probably be straining things to slip it into 5.19.

The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
right thing, but it's a networking errno. I suppose it's livable with if
it never escapes the kernel, but if it can get back to userspace then a
user would be justified in wondering how the heck a filesystem
operation generated a networking errno?


2022-05-11 07:38:23

by Dan Williams

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 7:29 PM Andrew Morton <[email protected]> wrote:
>
> On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
>
> > > It'll need to be a stable branch somewhere, but I don't think it
> > > really matters where al long as it's merged into the xfs for-next
> > > tree so it gets filesystem test coverage...
> >
> > So how about let the notify_failure() bits go through -mm this cycle,
> > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > baseline to build from?
>
> What are we referring to here? I think a minimal thing would be the
> memremap.h and memory-failure.c changes from
> https://lkml.kernel.org/r/[email protected] ?

Latest is here:
https://lore.kernel.org/all/[email protected]/

> Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> would probably be straining things to slip it into 5.19.

Hmm, if it's straining things and XFS will also target v5.20 I think
the best course for all involved is to just wait. Let some of the current
conflicts in -mm land in v5.19 and then I can merge the DAX baseline
and publish a stable branch for XFS and BTRFS to build upon for v5.20.

> The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> right thing, but it's a networking errno. I suppose livable with if it
> never escapes the kernel, but if it can get back to userspace then a
> user would be justified in wondering how the heck a filesystem
> operation generated a networking errno?

2022-05-11 07:55:42

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
>
> > > It'll need to be a stable branch somewhere, but I don't think it
> > > really matters where al long as it's merged into the xfs for-next
> > > tree so it gets filesystem test coverage...
> >
> > So how about let the notify_failure() bits go through -mm this cycle,
> > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > baseline to build from?
>
> What are we referring to here? I think a minimal thing would be the
> memremap.h and memory-failure.c changes from
> https://lkml.kernel.org/r/[email protected] ?
>
> Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> would probably be straining things to slip it into 5.19.
>
> The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> right thing, but it's a networking errno. I suppose livable with if it
> never escapes the kernel, but if it can get back to userspace then a
> user would be justified in wondering how the heck a filesystem
> operation generated a networking errno?

<shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
they don't know how to do something...

--D

2022-05-11 09:17:03

by Darrick J. Wong

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 09:20:57PM -0700, Dan Williams wrote:
> On Tue, May 10, 2022 at 7:29 PM Andrew Morton <[email protected]> wrote:
> >
> > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
> >
> > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > really matters where al long as it's merged into the xfs for-next
> > > > tree so it gets filesystem test coverage...
> > >
> > > So how about let the notify_failure() bits go through -mm this cycle,
> > > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > > baseline to build from?
> >
> > What are we referring to here? I think a minimal thing would be the
> > memremap.h and memory-failure.c changes from
> > https://lkml.kernel.org/r/[email protected] ?
>
> Latest is here:
> https://lore.kernel.org/all/[email protected]/
>
> > Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> > would probably be straining things to slip it into 5.19.
>
> Hmm, if it's straining things and XFS will also target v5.20 I think
> the best course for all involved is just wait. Let some of the current
> conflicts in -mm land in v5.19 and then I can merge the DAX baseline
> and publish a stable branch for XFS and BTRFS to build upon for v5.20.

Sounds good to /me...

--D

> > The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> > right thing, but it's a networking errno. I suppose livable with if it
> > never escapes the kernel, but if it can get back to userspace then a
> > user would be justified in wondering how the heck a filesystem
> > operation generated a networking errno?

2022-05-11 09:18:43

by Dave Chinner

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 05:03:52PM -0700, Darrick J. Wong wrote:
> On Sun, May 08, 2022 at 10:36:06PM +0800, Shiyang Ruan wrote:
> > This is a combination of two patchsets:
> > 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
> > 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/
> >
> > Changes since v13 of fsdax-rmap:
> > 1. Fixed mistakes during rebasing code to latest next-
> > 2. Rebased to next-20220504
> >
> > Changes since v10 of fsdax-reflink:
> > 1. Rebased to next-20220504 and fsdax-rmap
> > 2. Dropped a needless cleanup patch: 'fsdax: Convert dax_iomap_zero to
> > iter model'
> > 3. Fixed many conflicts during rebasing
> > 4. Fixed a dedupe bug in Patch 05: the actuall length to compare could be
> > shorter than smap->length or dmap->length.
> > PS: There are many changes during rebasing. I think it's better to
> > review again.
> >
> > ==
> > Shiyang Ruan (14):
> > fsdax-rmap:
> > dax: Introduce holder for dax_device
> > mm: factor helpers for memory_failure_dev_pagemap
> > pagemap,pmem: Introduce ->memory_failure()
> > fsdax: Introduce dax_lock_mapping_entry()
> > mm: Introduce mf_dax_kill_procs() for fsdax case
>
> Hmm. This patchset touches at least the dax, pagecache, and xfs
> subsystems. Assuming it's too late for 5.19, how should we stage this
> for 5.20?

Yeah, it's past my "last date for this merge cycle" which was
-rc6. I expected stuff might slip a little - as it has with the LARP
code - but I don't have the time and bandwidth to start working
on merging another feature from scratch before the merge window
comes around.

Getting the dax+reflink stuff in this cycle was always an optimistic
stretch, but I wanted to try so that there was no doubt it would be
ready for merge in the next cycle...

> I could just add the entire series to iomap-5.20-merge and base the
> xfs-5.20-merge off of that? But I'm not sure what else might be landing
> in the other subsystems, so I'm open to input.

It'll need to be a stable branch somewhere, but I don't think it
really matters where as long as it's merged into the xfs for-next
tree so it gets filesystem test coverage...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2022-05-11 09:46:42

by Shiyang Ruan

Subject: [PATCH v11.1 06/07] xfs: support CoW in fsdax mode

In fsdax mode, WRITE and ZERO on a shared extent need CoW to be
performed. After that, the newly allocated extents need to be remapped
to the file. So, add CoW identification in ->iomap_begin(), and
implement ->iomap_end() to do the remapping work.

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++-----
fs/xfs/xfs_iomap.c | 30 +++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 1 +
3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index af954a5b71f8..fe9f92586acf 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"

+#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
@@ -669,7 +670,7 @@ xfs_file_dax_write(
pos = iocb->ki_pos;

trace_xfs_file_dax_write(iocb, from);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -1254,6 +1255,31 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

+#ifdef CONFIG_FS_DAX
+int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return 0;
+}
+#endif
+
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
@@ -1285,10 +1311,7 @@ __xfs_filemap_fault(
pfn_t pfn;

xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5a393259a3a3..4c07f5e718fb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -773,7 +773,8 @@ xfs_direct_write_iomap_begin(

/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -867,6 +868,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};

+static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e88dc162c785..c782e8c0479c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -51,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;

#endif /* __XFS_IOMAP_H__*/
--
2.35.1




2022-05-11 10:18:19

by Darrick J. Wong

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Sun, May 08, 2022 at 10:36:06PM +0800, Shiyang Ruan wrote:
> This is a combination of two patchsets:
> 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
> 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/
>
> Changes since v13 of fsdax-rmap:
> 1. Fixed mistakes during rebasing code to latest next-
> 2. Rebased to next-20220504
>
> Changes since v10 of fsdax-reflink:
> 1. Rebased to next-20220504 and fsdax-rmap
> 2. Dropped a needless cleanup patch: 'fsdax: Convert dax_iomap_zero to
> iter model'
> 3. Fixed many conflicts during rebasing
> 4. Fixed a dedupe bug in Patch 05: the actuall length to compare could be
> shorter than smap->length or dmap->length.
> PS: There are many changes during rebasing. I think it's better to
> review again.
>
> ==
> Shiyang Ruan (14):
> fsdax-rmap:
> dax: Introduce holder for dax_device
> mm: factor helpers for memory_failure_dev_pagemap
> pagemap,pmem: Introduce ->memory_failure()
> fsdax: Introduce dax_lock_mapping_entry()
> mm: Introduce mf_dax_kill_procs() for fsdax case

Hmm. This patchset touches at least the dax, pagecache, and xfs
subsystems. Assuming it's too late for 5.19, how should we stage this
for 5.20?

I could just add the entire series to iomap-5.20-merge and base the
xfs-5.20-merge off of that? But I'm not sure what else might be landing
in the other subsystems, so I'm open to input.

--D

> xfs: Implement ->notify_failure() for XFS
> fsdax: set a CoW flag when associate reflink mappings
> fsdax-reflink:
> fsdax: Output address in dax_iomap_pfn() and rename it
> fsdax: Introduce dax_iomap_cow_copy()
> fsdax: Replace mmap entry in case of CoW
> fsdax: Add dax_iomap_cow_copy() for dax zero
> fsdax: Dedup file range to use a compare function
> xfs: support CoW in fsdax mode
> xfs: Add dax dedupe support
>
> drivers/dax/super.c | 67 +++++-
> drivers/md/dm.c | 2 +-
> drivers/nvdimm/pmem.c | 17 ++
> fs/dax.c | 398 ++++++++++++++++++++++++++++++------
> fs/erofs/super.c | 13 +-
> fs/ext2/super.c | 7 +-
> fs/ext4/super.c | 9 +-
> fs/remap_range.c | 31 ++-
> fs/xfs/Makefile | 5 +
> fs/xfs/xfs_buf.c | 10 +-
> fs/xfs/xfs_file.c | 9 +-
> fs/xfs/xfs_fsops.c | 3 +
> fs/xfs/xfs_inode.c | 69 ++++++-
> fs/xfs/xfs_inode.h | 1 +
> fs/xfs/xfs_iomap.c | 46 ++++-
> fs/xfs/xfs_iomap.h | 3 +
> fs/xfs/xfs_mount.h | 1 +
> fs/xfs/xfs_notify_failure.c | 220 ++++++++++++++++++++
> fs/xfs/xfs_reflink.c | 12 +-
> fs/xfs/xfs_super.h | 1 +
> include/linux/dax.h | 56 ++++-
> include/linux/fs.h | 12 +-
> include/linux/memremap.h | 12 ++
> include/linux/mm.h | 2 +
> include/linux/page-flags.h | 6 +
> mm/memory-failure.c | 257 ++++++++++++++++-------
> 26 files changed, 1087 insertions(+), 182 deletions(-)
> create mode 100644 fs/xfs/xfs_notify_failure.c
>
> --
> 2.35.1
>
>
>

2022-05-11 10:53:26

by Dave Chinner

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
> On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong" <[email protected]> wrote:
>
> > On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> > > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
> > >
> > > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > > really matters where al long as it's merged into the xfs for-next
> > > > > tree so it gets filesystem test coverage...
> > > >
> > > > So how about let the notify_failure() bits go through -mm this cycle,
> > > > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > > > baseline to build from?
> > >
> > > What are we referring to here? I think a minimal thing would be the
> > > memremap.h and memory-failure.c changes from
> > > https://lkml.kernel.org/r/[email protected] ?
> > >
> > > Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> > > would probably be straining things to slip it into 5.19.
> > >
> > > The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> > > right thing, but it's a networking errno. I suppose livable with if it
> > > never escapes the kernel, but if it can get back to userspace then a
> > > user would be justified in wondering how the heck a filesystem
> > > operation generated a networking errno?
> >
> > <shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
> > they don't know how to do something...
>
> Can it propagate back to userspace?

Maybe not this one, but the point Darrick is making is that we
really don't care if it does because we've been propagating it to
userspace in documented filesystem APIs for at least 15 years now.

e.g:

$ man 2 fallocate
.....
Errors
.....
EOPNOTSUPP
The filesystem containing the file referred to by fd
does not support this operation; or the mode is not
supported by the filesystem containing the file
referred to by fd.
.....

Other random examples:

pwritev2(RWF_NOWAIT) can return -EOPNOTSUPP on buffered writes.
Documented in the man page.

FICLONERANGE on an filesystem that doesn't support reflink will
return -EOPNOTSUPP. Documented in the man page.

mmap(MAP_SYNC) returns -EOPNOTSUPP if the underlying filesystem
and/or storage doesn't support DAX. Documented in the man page.

I could go on, but I think I've made the point already...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2022-05-12 19:02:24

by Shiyang Ruan

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink



On 2022/5/11 23:46, Dan Williams wrote:
> On Wed, May 11, 2022 at 8:21 AM Darrick J. Wong <[email protected]> wrote:
>>
>> Oan Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
>>> On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong" <[email protected]> wrote:
>>>
>>>> On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
>>>>> On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
>>>>>
>>>>>>> It'll need to be a stable branch somewhere, but I don't think it
>>>>>>> really matters where al long as it's merged into the xfs for-next
>>>>>>> tree so it gets filesystem test coverage...
>>>>>>
>>>>>> So how about let the notify_failure() bits go through -mm this cycle,
>>>>>> if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
>>>>>> baseline to build from?
>>>>>
>>>>> What are we referring to here? I think a minimal thing would be the
>>>>> memremap.h and memory-failure.c changes from
>>>>> https://lkml.kernel.org/r/[email protected] ?
>>>>>
>>>>> Sure, I can scoot that into 5.19-rc1 if you think that's best. It
>>>>> would probably be straining things to slip it into 5.19.
>>>>>
>>>>> The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
>>>>> right thing, but it's a networking errno. I suppose livable with if it
>>>>> never escapes the kernel, but if it can get back to userspace then a
>>>>> user would be justified in wondering how the heck a filesystem
>>>>> operation generated a networking errno?
>>>>
>>>> <shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
>>>> they don't know how to do something...
>>>
>>> Can it propagate back to userspace?
>>
>> AFAICT, the new code falls back to the current (mf_generic_kill_procs)
>> failure code if the filesystem doesn't provide a ->memory_failure
>> function or if it returns -EOPNOSUPP. mf_generic_kill_procs can also
>> return -EOPNOTSUPP, but all the memory_failure() callers (madvise, etc.)
>> convert that to 0 before returning it to userspace.
>>
>> I suppose the weirder question is going to be what happens when madvise
>> starts returning filesystem errors like EIO or EFSCORRUPTED when pmem
>> loses half its brains and even the fs can't deal with it.
>
> Even then that notification is not in a system call context so it
> would still result in a SIGBUS notification not a EOPNOTSUPP return
> code. The only potential gap I see are what are the possible error
> codes that MADV_SOFT_OFFLINE might see? The man page is silent on soft
> offline failure codes. Shiyang, that's something to check / update if
> necessary.

According to the code around MADV_SOFT_OFFLINE, it will return -EIO when
the backend is NVDIMM.

Here is the logic:

madvise_inject_error() {
	...
	if (MADV_SOFT_OFFLINE) {
		ret = soft_offline_page() {
			...
			/* Only online pages can be soft-offlined
			   (esp., not ZONE_DEVICE). */
			page = pfn_to_online_page(pfn);
			if (!page) {
				put_ref_page(ref_page);
				return -EIO;
			}
			...
		}
	} else {
		ret = memory_failure()
	}
	return ret
}


--
Thanks,
Ruan.



2022-05-13 11:10:30

by Darrick J. Wong

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
> On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong" <[email protected]> wrote:
>
> > On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> > > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
> > >
> > > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > > really matters where al long as it's merged into the xfs for-next
> > > > > tree so it gets filesystem test coverage...
> > > >
> > > > So how about let the notify_failure() bits go through -mm this cycle,
> > > > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > > > baseline to build from?
> > >
> > > What are we referring to here? I think a minimal thing would be the
> > > memremap.h and memory-failure.c changes from
> > > https://lkml.kernel.org/r/[email protected] ?
> > >
> > > Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> > > would probably be straining things to slip it into 5.19.
> > >
> > > The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> > > right thing, but it's a networking errno. I suppose livable with if it
> > > never escapes the kernel, but if it can get back to userspace then a
> > > user would be justified in wondering how the heck a filesystem
> > > operation generated a networking errno?
> >
> > <shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
> > they don't know how to do something...
>
> Can it propagate back to userspace?

AFAICT, the new code falls back to the current (mf_generic_kill_procs)
failure code if the filesystem doesn't provide a ->memory_failure
function or if it returns -EOPNOTSUPP. mf_generic_kill_procs can also
return -EOPNOTSUPP, but all the memory_failure() callers (madvise, etc.)
convert that to 0 before returning it to userspace.
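
Schematically, the fallback described above looks roughly like the
following. This is a simplified sketch of the flow, not the literal
memory-failure.c hunk from the series:

	/* In the pgmap/fsdax branch of memory_failure(), roughly: */
	if (pgmap->ops && pgmap->ops->memory_failure) {
		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
		/* Anything other than "not supported" is the final verdict. */
		if (rc != -EOPNOTSUPP)
			goto out;
	}
	/* No ->memory_failure(), or the fs punted: generic DAX handling. */
	rc = mf_generic_kill_procs(pfn, flags, pgmap);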

I suppose the weirder question is going to be what happens when madvise
starts returning filesystem errors like EIO or EFSCORRUPTED when pmem
loses half its brains and even the fs can't deal with it.

--D

2022-05-14 01:43:36

by Dan Williams

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Wed, May 11, 2022 at 8:21 AM Darrick J. Wong <[email protected]> wrote:
>
> Oan Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
> > On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong" <[email protected]> wrote:
> >
> > > On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> > > > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams <[email protected]> wrote:
> > > >
> > > > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > > > really matters where al long as it's merged into the xfs for-next
> > > > > > tree so it gets filesystem test coverage...
> > > > >
> > > > > So how about let the notify_failure() bits go through -mm this cycle,
> > > > > if Andrew will have it, and then the reflnk work has a clean v5.19-rc1
> > > > > baseline to build from?
> > > >
> > > > What are we referring to here? I think a minimal thing would be the
> > > > memremap.h and memory-failure.c changes from
> > > > https://lkml.kernel.org/r/[email protected] ?
> > > >
> > > > Sure, I can scoot that into 5.19-rc1 if you think that's best. It
> > > > would probably be straining things to slip it into 5.19.
> > > >
> > > > The use of EOPNOTSUPP is a bit suspect, btw. It *sounds* like the
> > > > right thing, but it's a networking errno. I suppose livable with if it
> > > > never escapes the kernel, but if it can get back to userspace then a
> > > > user would be justified in wondering how the heck a filesystem
> > > > operation generated a networking errno?
> > >
> > > <shrug> most filesystems return EOPNOTSUPP rather enthusiastically when
> > > they don't know how to do something...
> >
> > Can it propagate back to userspace?
>
> AFAICT, the new code falls back to the current (mf_generic_kill_procs)
> failure code if the filesystem doesn't provide a ->memory_failure
> function or if it returns -EOPNOSUPP. mf_generic_kill_procs can also
> return -EOPNOTSUPP, but all the memory_failure() callers (madvise, etc.)
> convert that to 0 before returning it to userspace.
>
> I suppose the weirder question is going to be what happens when madvise
> starts returning filesystem errors like EIO or EFSCORRUPTED when pmem
> loses half its brains and even the fs can't deal with it.

Even then that notification is not in a system call context, so it
would still result in a SIGBUS notification, not an EOPNOTSUPP return
code. The only potential gap I see is which error codes
MADV_SOFT_OFFLINE might see. The man page is silent on soft
offline failure codes. Shiyang, that's something to check / update if
necessary.

2022-06-02 16:14:48

by Shiyang Ruan

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

Hi,

Is there any other work I should do with these two patchsets? I think
they are good for now. So... since 5.19-rc1 is coming, could the
notify_failure() part be merged as planned?


--
Thanks,
Ruan.


On 2022/5/12 20:27, Shiyang Ruan wrote:
>
>
> On 2022/5/11 23:46, Dan Williams wrote:
>> On Wed, May 11, 2022 at 8:21 AM Darrick J. Wong <[email protected]>
>> wrote:
>>>
>>> Oan Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
>>>> On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong"
>>>> <[email protected]> wrote:
>>>>
>>>>> On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
>>>>>> On Tue, 10 May 2022 18:55:50 -0700 Dan Williams
>>>>>> <[email protected]> wrote:
>>>>>>
>>>>>>>> It'll need to be a stable branch somewhere, but I don't think it
>>>>>>>> really matters where al long as it's merged into the xfs for-next
>>>>>>>> tree so it gets filesystem test coverage...
>>>>>>>
>>>>>>> So how about let the notify_failure() bits go through -mm this
>>>>>>> cycle,
>>>>>>> if Andrew will have it, and then the reflnk work has a clean
>>>>>>> v5.19-rc1
>>>>>>> baseline to build from?
>>>>>>
>>>>>> What are we referring to here?  I think a minimal thing would be the
>>>>>> memremap.h and memory-failure.c changes from
>>>>>> https://lkml.kernel.org/r/[email protected]
>>>>>> ?
>>>>>>
>>>>>> Sure, I can scoot that into 5.19-rc1 if you think that's best.  It
>>>>>> would probably be straining things to slip it into 5.19.
>>>>>>
>>>>>> The use of EOPNOTSUPP is a bit suspect, btw.  It *sounds* like the
>>>>>> right thing, but it's a networking errno.  I suppose livable with
>>>>>> if it
>>>>>> never escapes the kernel, but if it can get back to userspace then a
>>>>>> user would be justified in wondering how the heck a filesystem
>>>>>> operation generated a networking errno?
>>>>>
>>>>> <shrug> most filesystems return EOPNOTSUPP rather enthusiastically
>>>>> when
>>>>> they don't know how to do something...
>>>>
>>>> Can it propagate back to userspace?
>>>
>>> AFAICT, the new code falls back to the current (mf_generic_kill_procs)
>>> failure code if the filesystem doesn't provide a ->memory_failure
>>> function or if it returns -EOPNOSUPP.  mf_generic_kill_procs can also
>>> return -EOPNOTSUPP, but all the memory_failure() callers (madvise, etc.)
>>> convert that to 0 before returning it to userspace.
>>>
>>> I suppose the weirder question is going to be what happens when madvise
>>> starts returning filesystem errors like EIO or EFSCORRUPTED when pmem
>>> loses half its brains and even the fs can't deal with it.
>>
>> Even then that notification is not in a system call context so it
>> would still result in a SIGBUS notification not a EOPNOTSUPP return
>> code. The only potential gap I see are what are the possible error
>> codes that MADV_SOFT_OFFLINE might see? The man page is silent on soft
>> offline failure codes. Shiyang, that's something to check / update if
>> necessary.
>
> According to the code around MADV_SOFT_OFFLINE, it will return -EIO when
> the backend is NVDIMM.
>
> Here is the logic:
>  madvise_inject_error() {
>      ...
>      if (MADV_SOFT_OFFLINE) {
>          ret = soft_offline_page() {
>              ...
>              /* Only online pages can be soft-offlined (esp., not
> ZONE_DEVICE). */
>              page = pfn_to_online_page(pfn);
>              if (!page) {
>                  put_ref_page(ref_page);
>                  return -EIO;
>              }
>              ...
>          }
>      } else {
>          ret = memory_failure()
>      }
>      return ret
>  }
>
>
> --
> Thanks,
> Ruan.
>
>



2022-06-03 08:24:16

by Andrew Morton

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Sun, 8 May 2022 22:36:06 +0800 Shiyang Ruan <[email protected]> wrote:

> This is a combination of two patchsets:
> 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
> 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/

I'm getting lost in conflicts trying to get this merged up. Mainly
memory-failure.c, due to the patch series "mm, hwpoison: enable 1GB
hugepage support".

Could you please take a look at what's in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm a few hours from
now? Or the next linux-next.

And I suggest that converting it all into a single 14-patch series
would be more straightforward.

Thanks.

2022-06-03 11:49:52

by Andrew Morton

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Thu, 2 Jun 2022 10:18:09 -0700 "Darrick J. Wong" <[email protected]> wrote:

> On Thu, Jun 02, 2022 at 05:42:13PM +0800, Shiyang Ruan wrote:
> > Hi,
> >
> > Is there any other work I should do with these two patchsets? I think they
> > are good for now. So... since the 5.19-rc1 is coming, could the
> > notify_failure() part be merged as your plan?
>
> Hmm. I don't see any of the patches 1-5,7-13 in current upstream, so
> I'm guessing this means Andrew isn't taking it for 5.19?

Sorry, the volume of commentary led me to believe that it wasn't
considered finalized. Shall take a look now.




2022-06-03 17:56:14

by Darrick J. Wong

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink

On Thu, Jun 02, 2022 at 05:42:13PM +0800, Shiyang Ruan wrote:
> Hi,
>
> Is there any other work I should do with these two patchsets? I think they
> are good for now. So... since the 5.19-rc1 is coming, could the
> notify_failure() part be merged as your plan?

Hmm. I don't see any of the patches 1-5,7-13 in current upstream, so
I'm guessing this means Andrew isn't taking it for 5.19?

--D

>
>
> --
> Thanks,
> Ruan.
>
>
> On 2022/5/12 20:27, Shiyang Ruan wrote:
> >
> >
> > On 2022/5/11 23:46, Dan Williams wrote:
> > > On Wed, May 11, 2022 at 8:21 AM Darrick J. Wong <[email protected]>
> > > wrote:
> > > >
> > > > Oan Tue, May 10, 2022 at 10:24:28PM -0700, Andrew Morton wrote:
> > > > > On Tue, 10 May 2022 19:43:01 -0700 "Darrick J. Wong"
> > > > > <[email protected]> wrote:
> > > > >
> > > > > > On Tue, May 10, 2022 at 07:28:53PM -0700, Andrew Morton wrote:
> > > > > > > On Tue, 10 May 2022 18:55:50 -0700 Dan Williams
> > > > > > > <[email protected]> wrote:
> > > > > > >
> > > > > > > > > It'll need to be a stable branch somewhere, but I don't think it
> > > > > > > > > really matters where al long as it's merged into the xfs for-next
> > > > > > > > > tree so it gets filesystem test coverage...
> > > > > > > >
> > > > > > > > So how about let the notify_failure() bits go
> > > > > > > > through -mm this cycle,
> > > > > > > > if Andrew will have it, and then the reflnk work
> > > > > > > > has a clean v5.19-rc1
> > > > > > > > baseline to build from?
> > > > > > >
> > > > > > > What are we referring to here?  I think a minimal thing would be the
> > > > > > > memremap.h and memory-failure.c changes from
> > > > > > > https://lkml.kernel.org/r/[email protected]
> > > > > > > ?
> > > > > > >
> > > > > > > Sure, I can scoot that into 5.19-rc1 if you think that's best.  It
> > > > > > > would probably be straining things to slip it into 5.19.
> > > > > > >
> > > > > > > The use of EOPNOTSUPP is a bit suspect, btw.  It *sounds* like the
> > > > > > > right thing, but it's a networking errno.  I suppose
> > > > > > > livable with if it
> > > > > > > never escapes the kernel, but if it can get back to userspace then a
> > > > > > > user would be justified in wondering how the heck a filesystem
> > > > > > > operation generated a networking errno?
> > > > > >
> > > > > > <shrug> most filesystems return EOPNOTSUPP rather
> > > > > > enthusiastically when
> > > > > > they don't know how to do something...
> > > > >
> > > > > Can it propagate back to userspace?
> > > >
> > > > AFAICT, the new code falls back to the current (mf_generic_kill_procs)
> > > > failure code if the filesystem doesn't provide a ->memory_failure
> > > > function or if it returns -EOPNOSUPP.  mf_generic_kill_procs can also
> > > > return -EOPNOTSUPP, but all the memory_failure() callers (madvise, etc.)
> > > > convert that to 0 before returning it to userspace.
> > > >
> > > > I suppose the weirder question is going to be what happens when madvise
> > > > starts returning filesystem errors like EIO or EFSCORRUPTED when pmem
> > > > loses half its brains and even the fs can't deal with it.
> > >
> > > Even then that notification is not in a system call context so it
> > > would still result in a SIGBUS notification not a EOPNOTSUPP return
> > > code. The only potential gap I see are what are the possible error
> > > codes that MADV_SOFT_OFFLINE might see? The man page is silent on soft
> > > offline failure codes. Shiyang, that's something to check / update if
> > > necessary.
> >
> > According to the code around MADV_SOFT_OFFLINE, it will return -EIO when
> > the backend is NVDIMM.
> >
> > Here is the logic:
> >  madvise_inject_error() {
> >      ...
> >      if (MADV_SOFT_OFFLINE) {
> >          ret = soft_offline_page() {
> >              ...
> >              /* Only online pages can be soft-offlined (esp., not
> > ZONE_DEVICE). */
> >              page = pfn_to_online_page(pfn);
> >              if (!page) {
> >                  put_ref_page(ref_page);
> >                  return -EIO;
> >              }
> >              ...
> >          }
> >      } else {
> >          ret = memory_failure()
> >      }
> >      return ret
> >  }
> >
> >
> > --
> > Thanks,
> > Ruan.
> >
> >
>
>

2022-06-03 21:06:15

by Shiyang Ruan

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink



On 2022/6/3 9:07, Shiyang Ruan wrote:
>
>
> On 2022/6/3 2:56, Andrew Morton wrote:
>> On Sun, 8 May 2022 22:36:06 +0800 Shiyang Ruan
>> <[email protected]> wrote:
>>
>>> This is a combination of two patchsets:
>>>   1.fsdax-rmap:
>>> https://lore.kernel.org/linux-xfs/[email protected]/
>>>
>>>   2.fsdax-reflink:
>>> https://lore.kernel.org/linux-xfs/[email protected]/
>>>
>>
>> I'm getting lost in conflicts trying to get this merged up.  Mainly
>> memory-failure.c due to patch series "mm, hwpoison: enable 1GB hugepage
>> support".
>>
>> Could you please take a look at what's in the mm-unstable branch at
>> git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm a few hours from
>> now?  Or the next linux-next.

OK, let me rebase this patchset on your mm-unstable branch.


--
Thanks,
Ruan.

>>
>> And I suggest that converting it all into a single 14-patch series
>> would be more straightforward.
>
> The patchset in this thread is the 14-patch series.  I have solved many
> conflicts.  It's an updated / newest version, and a combination of the 2
> urls quoted above.  In an other word, instead of using this two:
>
> >> This is a combination of two patchsets:
> >>   1.fsdax-rmap: https://...
> >>   2.fsdax-reflink: https://...
>
> you could take this (the url of the current thread):
> https://lore.kernel.org/linux-xfs/[email protected]/
>
>
> My description misleaded you.  Sorry for that.
>
>
> --
> Thanks,
> Ruan.
>
>>
>> Thanks.
>
>
>


2022-06-05 03:47:52

by Shiyang Ruan

Subject: Re: [PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink



On 2022/6/3 2:56, Andrew Morton wrote:
> On Sun, 8 May 2022 22:36:06 +0800 Shiyang Ruan <[email protected]> wrote:
>
>> This is a combination of two patchsets:
>> 1.fsdax-rmap: https://lore.kernel.org/linux-xfs/[email protected]/
>> 2.fsdax-reflink: https://lore.kernel.org/linux-xfs/[email protected]/
>
> I'm getting lost in conflicts trying to get this merged up. Mainly
> memory-failure.c due to patch series "mm, hwpoison: enable 1GB hugepage
> support".
>
> Could you please take a look at what's in the mm-unstable branch at
> git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm a few hours from
> now? Or the next linux-next.
>
> And I suggest that converting it all into a single 14-patch series
> would be more straightforward.

The patchset in this thread is the 14-patch series. I have resolved many
conflicts. It is an updated, newest version, and a combination of the 2
URLs quoted above. In other words, instead of using these two:

>> This is a combination of two patchsets:
>> 1.fsdax-rmap: https://...
>> 2.fsdax-reflink: https://...

you could take this (the URL of the current thread):
https://lore.kernel.org/linux-xfs/[email protected]/

My description misled you. Sorry for that.


--
Thanks,
Ruan.

>
> Thanks.