2021-03-19 01:56:51

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax

From: Shiyang Ruan <[email protected]>

This patchset attempts to add CoW support for fsdax, taking XFS,
which has both the reflink and fsdax features, as an example.

Changes from V2:
- Fix the mistake in iomap_apply2() and dax_dedupe_file_range_compare()
- Add CoW judgement in dax_iomap_zero()
- Fix other code style problems and mistakes

Changes from V1:
- Factor some helper functions to simplify dax fault code
- Introduce iomap_apply2() for dax_dedupe_file_range_compare()
- Fix mistakes and other problems
- Rebased on v5.11

One of the key mechanisms that needs to be implemented in fsdax is CoW:
copy the data from srcmap before we actually write data to the destination
iomap. We only copy the part of the range whose data won't be overwritten.

Another mechanism is range comparison. In the page cache case, readpage()
is used to load data from disk into the page cache so that the data can
be compared. In the fsdax case, readpage() does not work. So, we need
another way to compare data, based on direct access support.

With these two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.


Some of the patches are picked up from Goldwyn's patchset. I made some
changes to adapt to this patchset.

(Rebased on v5.11)
==

Shiyang Ruan (10):
fsdax: Factor helpers to simplify dax fault code
fsdax: Factor helper: dax_fault_actor()
fsdax: Output address in dax_iomap_pfn() and rename it
fsdax: Introduce dax_iomap_cow_copy()
fsdax: Replace mmap entry in case of CoW
fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero
iomap: Introduce iomap_apply2() for operations on two files
fsdax: Dedup file range to use a compare function
fs/xfs: Handle CoW for fsdax write() path
fs/xfs: Add dedupe support for fsdax

fs/dax.c | 596 ++++++++++++++++++++++++++---------------
fs/iomap/apply.c | 56 ++++
fs/iomap/buffered-io.c | 2 +-
fs/remap_range.c | 45 +++-
fs/xfs/xfs_bmap_util.c | 3 +-
fs/xfs/xfs_file.c | 29 +-
fs/xfs/xfs_inode.c | 8 +-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_iomap.c | 58 +++-
fs/xfs/xfs_iomap.h | 4 +
fs/xfs/xfs_iops.c | 7 +-
fs/xfs/xfs_reflink.c | 17 +-
include/linux/dax.h | 7 +-
include/linux/fs.h | 15 +-
include/linux/iomap.h | 7 +-
15 files changed, 602 insertions(+), 253 deletions(-)

--
2.30.1




2021-03-19 01:58:05

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 01/10] fsdax: Factor helpers to simplify dax fault code

The dax page fault code is too long and a bit difficult to read. And it
is hard to understand when we try to add new features. Some of the
PTE/PMD code paths have similar logic. So, factor them out as helper
functions to simplify the code.

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 152 ++++++++++++++++++++++++++++++-------------------------
1 file changed, 84 insertions(+), 68 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 26d5dcd2d69e..7031e4302b13 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1243,6 +1243,52 @@ static bool dax_fault_is_synchronous(unsigned long flags,
&& (iomap->flags & IOMAP_F_DIRTY);
}

+/*
+ * If we are doing synchronous page fault and inode needs fsync, we can insert
+ * PTE/PMD into page tables only after that happens. Skip insertion for now and
+ * return the pfn so that caller can insert it after fsync is done.
+ */
+static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+{
+ if (WARN_ON_ONCE(!pfnp))
+ return VM_FAULT_SIGBUS;
+
+ *pfnp = pfn;
+ return VM_FAULT_NEEDDSYNC;
+}
+
+static int dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap,
+ loff_t pos, vm_fault_t *ret)
+{
+ int error = 0;
+ unsigned long vaddr = vmf->address;
+ sector_t sector = dax_iomap_sector(iomap, pos);
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ case IOMAP_UNWRITTEN:
+ clear_user_highpage(vmf->cow_page, vaddr);
+ break;
+ case IOMAP_MAPPED:
+ error = copy_cow_page_dax(iomap->bdev, iomap->dax_dev,
+ sector, vmf->cow_page, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ }
+
+ if (error)
+ return error;
+
+ __SetPageUptodate(vmf->cow_page);
+ *ret = finish_fault(vmf);
+ if (!*ret)
+ *ret = VM_FAULT_DONE_COW;
+ return 0;
+}
+
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
@@ -1311,30 +1357,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
}

if (vmf->cow_page) {
- sector_t sector = dax_iomap_sector(&iomap, pos);
-
- switch (iomap.type) {
- case IOMAP_HOLE:
- case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
- break;
- case IOMAP_MAPPED:
- error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
- break;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
- }
-
+ error = dax_fault_cow_page(vmf, &iomap, pos, &ret);
if (error)
- goto error_finish_iomap;
-
- __SetPageUptodate(vmf->cow_page);
- ret = finish_fault(vmf);
- if (!ret)
- ret = VM_FAULT_DONE_COW;
+ ret = dax_fault_return(error);
goto finish_iomap;
}

@@ -1354,19 +1379,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);

- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PTE into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
if (sync) {
- if (WARN_ON_ONCE(!pfnp)) {
- error = -EIO;
- goto error_finish_iomap;
- }
- *pfnp = pfn;
- ret = VM_FAULT_NEEDDSYNC | major;
+ ret = dax_fault_synchronous_pfnp(pfnp, pfn);
goto finish_iomap;
}
trace_dax_insert_mapping(inode, vmf, entry);
@@ -1465,13 +1479,45 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
return VM_FAULT_FALLBACK;
}

+static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
+ pgoff_t max_pgoff)
+{
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ /*
+ * Make sure that the faulting address's PMD offset (color) matches
+ * the PMD offset from the start of the file. This is necessary so
+ * that a PMD range in the page table overlaps exactly with a PMD
+ * range in the page cache.
+ */
+ if ((vmf->pgoff & PG_PMD_COLOUR) !=
+ ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
+ return true;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vmf->vma->vm_flags & VM_SHARED))
+ return true;
+
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vmf->vma->vm_start)
+ return true;
+ if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
+ return true;
+
+ /* If the PMD would extend beyond the file size */
+ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ return true;
+
+ return false;
+}
+
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
- unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
@@ -1494,33 +1540,12 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,

trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

- /*
- * Make sure that the faulting address's PMD offset (color) matches
- * the PMD offset from the start of the file. This is necessary so
- * that a PMD range in the page table overlaps exactly with a PMD
- * range in the page cache.
- */
- if ((vmf->pgoff & PG_PMD_COLOUR) !=
- ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
- goto fallback;
-
- /* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
- goto fallback;
-
- /* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
- goto fallback;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
- goto fallback;
-
if (xas.xa_index >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}

- /* If the PMD would extend beyond the file size */
- if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
+ if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
goto fallback;

/*
@@ -1572,17 +1597,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
DAX_PMD, write && !sync);

- /*
- * If we are doing synchronous page fault and inode needs fsync,
- * we can insert PMD into page tables only after that happens.
- * Skip insertion for now and return the pfn so that caller can
- * insert it after fsync is done.
- */
if (sync) {
- if (WARN_ON_ONCE(!pfnp))
- goto finish_iomap;
- *pfnp = pfn;
- result = VM_FAULT_NEEDDSYNC;
+ result = dax_fault_synchronous_pfnp(pfnp, pfn);
goto finish_iomap;
}

--
2.30.1



2021-03-19 02:00:03

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor()

The core logic in the two dax page fault functions is similar. So, move
the logic into a common helper function. Also, to facilitate the
addition of new features, such as CoW, switch-case is no longer used to
handle different iomap types.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 291 +++++++++++++++++++++++++++----------------------------
1 file changed, 145 insertions(+), 146 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 7031e4302b13..33ddad0f3091 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1053,6 +1053,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
return ret;
}

+#ifdef CONFIG_FS_DAX_PMD
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ unsigned long pmd_addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = mapping->host;
+ pgtable_t pgtable = NULL;
+ struct page *zero_page;
+ spinlock_t *ptl;
+ pmd_t pmd_entry;
+ pfn_t pfn;
+
+ zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ pfn = page_to_pfn_t(zero_page);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (!pmd_none(*(vmf->pmd))) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ if (pgtable) {
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+ pmd_entry = pmd_mkhuge(pmd_entry);
+ set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
+ spin_unlock(ptl);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+ return VM_FAULT_NOPAGE;
+
+fallback:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+ return VM_FAULT_FALLBACK;
+}
+#else
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
+{
+ return VM_FAULT_FALLBACK;
+}
+#endif /* CONFIG_FS_DAX_PMD */
+
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
@@ -1289,6 +1349,61 @@ static int dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap,
return 0;
}

+/**
+ * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault.
+ * @vmf: vm fault instance
+ * @pfnp: pfn to be returned
+ * @xas: the dax mapping tree of a file
+ * @entry: an unlocked dax entry to be inserted
+ * @pmd: distinguish whether it is a pmd fault
+ * @flags: iomap flags
+ * @iomap: from iomap_begin()
+ * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW
+ */
+static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
+ struct xa_state *xas, void *entry, bool pmd, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
+ loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
+ int err = 0;
+ pfn_t pfn;
+
+ /* if we are reading UNWRITTEN and HOLE, return a hole. */
+ if (!write &&
+ (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
+ if (!pmd)
+ return dax_load_hole(xas, mapping, &entry, vmf);
+ else
+ return dax_pmd_load_hole(xas, vmf, iomap, &entry);
+ }
+
+ if (iomap->type != IOMAP_MAPPED) {
+ WARN_ON_ONCE(1);
+ return VM_FAULT_SIGBUS;
+ }
+
+ err = dax_iomap_pfn(iomap, pos, size, &pfn);
+ if (err)
+ return dax_fault_return(err);
+
+ entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
+ write && !sync);
+
+ if (sync)
+ return dax_fault_synchronous_pfnp(pfnp, pfn);
+
+ if (pmd)
+ return vmf_insert_pfn_pmd(vmf, pfn, write);
+ if (write)
+ return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ else
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+}
+
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
@@ -1296,17 +1411,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct address_space *mapping = vma->vm_file->f_mapping;
XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct inode *inode = mapping->host;
- unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
struct iomap iomap = { .type = IOMAP_HOLE };
struct iomap srcmap = { .type = IOMAP_HOLE };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
vm_fault_t ret = 0;
void *entry;
- pfn_t pfn;

trace_dax_pte_fault(inode, vmf, ret);
/*
@@ -1352,8 +1464,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- error = -EIO; /* fs corruption? */
- goto error_finish_iomap;
+ ret = VM_FAULT_SIGBUS; /* fs corruption? */
+ goto finish_iomap;
}

if (vmf->cow_page) {
@@ -1363,49 +1475,19 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto finish_iomap;
}

- sync = dax_fault_is_synchronous(flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- if (iomap.flags & IOMAP_F_NEW) {
- count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
- }
- error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
- if (error < 0)
- goto error_finish_iomap;
-
- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- 0, write && !sync);
-
- if (sync) {
- ret = dax_fault_synchronous_pfnp(pfnp, pfn);
- goto finish_iomap;
- }
- trace_dax_insert_mapping(inode, vmf, entry);
- if (write)
- ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
- else
- ret = vmf_insert_mixed(vma, vaddr, pfn);
-
+ ret = dax_fault_actor(vmf, pfnp, &xas, entry, false, flags,
+ &iomap, &srcmap);
+ if (ret == VM_FAULT_SIGBUS)
goto finish_iomap;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (!write) {
- ret = dax_load_hole(&xas, mapping, &entry, vmf);
- goto finish_iomap;
- }
- fallthrough;
- default:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
+
+ /* read/write MAPPED, CoW UNWRITTEN */
+ if (iomap.flags & IOMAP_F_NEW) {
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
}

- error_finish_iomap:
- ret = dax_fault_return(error);
- finish_iomap:
+finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;

@@ -1419,66 +1501,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
- unlock_entry:
+unlock_entry:
dax_unlock_entry(&xas, entry);
- out:
+out:
trace_dax_pte_fault_done(inode, vmf, ret);
return ret | major;
}

#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- struct iomap *iomap, void **entry)
-{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
- struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
- struct page *zero_page;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- pfn_t pfn;
-
- zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
-
- if (unlikely(!zero_page))
- goto fallback;
-
- pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
- }
-
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
- return VM_FAULT_NOPAGE;
-
-fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
- return VM_FAULT_FALLBACK;
-}
-
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
pgoff_t max_pgoff)
{
@@ -1519,17 +1549,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct address_space *mapping = vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync;
- unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
+ unsigned int flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
- vm_fault_t result = VM_FAULT_FALLBACK;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
struct iomap iomap = { .type = IOMAP_HOLE };
struct iomap srcmap = { .type = IOMAP_HOLE };
pgoff_t max_pgoff;
void *entry;
loff_t pos;
int error;
- pfn_t pfn;

/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1541,7 +1569,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

if (xas.xa_index >= max_pgoff) {
- result = VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
goto out;
}

@@ -1556,7 +1584,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
*/
entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
if (xa_is_internal(entry)) {
- result = xa_to_internal(entry);
+ ret = xa_to_internal(entry);
goto fallback;
}

@@ -1568,7 +1596,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
*/
if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
!pmd_devmap(*vmf->pmd)) {
- result = 0;
+ ret = 0;
goto unlock_entry;
}

@@ -1578,49 +1606,21 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* to look up our filesystem block.
*/
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
- &srcmap);
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, flags, &iomap, &srcmap);
if (error)
goto unlock_entry;

if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;

- sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
-
- switch (iomap.type) {
- case IOMAP_MAPPED:
- error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
- if (error < 0)
- goto finish_iomap;
+ ret = dax_fault_actor(vmf, pfnp, &xas, entry, true, flags,
+ &iomap, &srcmap);

- entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- DAX_PMD, write && !sync);
-
- if (sync) {
- result = dax_fault_synchronous_pfnp(pfnp, pfn);
- goto finish_iomap;
- }
-
- trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
- result = vmf_insert_pfn_pmd(vmf, pfn, write);
- break;
- case IOMAP_UNWRITTEN:
- case IOMAP_HOLE:
- if (WARN_ON_ONCE(write))
- break;
- result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
- }
-
- finish_iomap:
+finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;

- if (result == VM_FAULT_FALLBACK)
+ if (ret == VM_FAULT_FALLBACK)
copied = 0;
/*
* The fault is done by now and there's no way back (other
@@ -1628,19 +1628,18 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* Just ignore error from ->iomap_end since we cannot do much
* with it.
*/
- ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
- &iomap);
+ ops->iomap_end(inode, pos, PMD_SIZE, copied, flags, &iomap);
}
- unlock_entry:
+unlock_entry:
dax_unlock_entry(&xas, entry);
- fallback:
- if (result == VM_FAULT_FALLBACK) {
+fallback:
+ if (ret == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
count_vm_event(THP_FAULT_FALLBACK);
}
out:
- trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
- return result;
+ trace_dax_pmd_fault_done(inode, vmf, max_pgoff, ret);
+ return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
--
2.30.1



2021-03-19 02:00:20

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 03/10] fsdax: Output address in dax_iomap_pfn() and rename it

Add an address output to dax_iomap_pfn() in order to perform a memcpy() in
the CoW case. Since this function outputs both the address and the pfn,
rename it to dax_iomap_direct_access().

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 33ddad0f3091..a70e6aa285bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -997,8 +997,8 @@ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}

-static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
- pfn_t *pfnp)
+static int dax_iomap_direct_access(struct iomap *iomap, loff_t pos, size_t size,
+ void **kaddr, pfn_t *pfnp)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
@@ -1010,11 +1010,13 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
return rc;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- NULL, pfnp);
+ kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
+ if (!pfnp)
+ goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -1024,6 +1026,12 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
+
+out_check_addr:
+ if (!kaddr)
+ goto out;
+ if (!*kaddr)
+ rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
@@ -1386,7 +1394,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
return VM_FAULT_SIGBUS;
}

- err = dax_iomap_pfn(iomap, pos, size, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, NULL, &pfn);
if (err)
return dax_fault_return(err);

--
2.30.1



2021-03-19 02:01:15

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 04/10] fsdax: Introduce dax_iomap_cow_copy()

In the case where the iomap is a write operation and iomap is not equal
to srcmap after iomap_begin, we consider it a CoW operation.

The destination extent which iomap indicates is a newly allocated extent.
So, the data needs to be copied from srcmap to the newly allocated extent.
In theory, it is better to copy only the head and tail ranges which lie
outside the non-aligned area instead of copying the whole aligned
range. But in a dax page fault, the range will always be aligned. So,
we have to copy the whole range in this case.

Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 66 insertions(+), 5 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a70e6aa285bb..181aad97136a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1037,6 +1037,51 @@ static int dax_iomap_direct_access(struct iomap *iomap, loff_t pos, size_t size,
return rc;
}

+/*
+ * Copy the head and tail part of the pages not included in the write but
+ * required for CoW, because pos/pos+length are not page aligned. But in dax
+ * page fault case, the range is page aligned, we need to copy the whole range
+ * of data. Use copy_edge to distinguish these cases.
+ */
+static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t align_size,
+ struct iomap *srcmap, void *daddr, bool copy_edge)
+{
+ loff_t head_off = pos & (align_size - 1);
+ size_t size = ALIGN(head_off + length, align_size);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, align_size);
+ void *saddr = 0;
+ int ret = 0;
+
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+
+ if (!copy_edge)
+ return copy_mc_to_kernel(daddr, saddr, length);
+
+ /* Copy the head part of the range. Note: we pass offset as length. */
+ if (head_off) {
+ if (saddr)
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ else
+ memset(daddr, 0, head_off);
+ }
+ /* Copy the tail part of the range */
+ if (end < pg_end) {
+ loff_t tail_off = head_off + length;
+ loff_t tail_len = pg_end - end;
+
+ if (saddr)
+ ret = copy_mc_to_kernel(daddr + tail_off,
+ saddr + tail_off, tail_len);
+ else
+ memset(daddr + tail_off, 0, tail_len);
+ }
+
+ return ret;
+}
+
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -1166,11 +1211,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct dax_device *dax_dev = iomap->dax_dev;
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
+ bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;

- if (iov_iter_rw(iter) == READ) {
+ if (!write) {
end = min(end, i_size_read(inode));
if (pos >= end)
return 0;
@@ -1179,7 +1225,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return iov_iter_zero(min(length, end - pos), iter);
}

- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ !(iomap->flags & IOMAP_F_SHARED)))
return -EIO;

/*
@@ -1218,6 +1265,13 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
break;
}

+ if (write && srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
+ kaddr, true);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@@ -1229,7 +1283,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* validated via access_ok() in either vfs_read() or
* vfs_write(), depending on which operation we are doing.
*/
- if (iov_iter_rw(iter) == WRITE)
+ if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1379,6 +1433,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
int err = 0;
pfn_t pfn;
+ void *kaddr;

/* if we are reading UNWRITTEN and HOLE, return a hole. */
if (!write &&
@@ -1389,18 +1444,24 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
return dax_pmd_load_hole(xas, vmf, iomap, &entry);
}

- if (iomap->type != IOMAP_MAPPED) {
+ if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return VM_FAULT_SIGBUS;
}

- err = dax_iomap_direct_access(iomap, pos, size, NULL, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return dax_fault_return(err);

entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
write && !sync);

+ if (write && srcmap->addr != iomap->addr) {
+ err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr, false);
+ if (err)
+ return dax_fault_return(err);
+ }
+
if (sync)
return dax_fault_synchronous_pfnp(pfnp, pfn);

--
2.30.1



2021-03-19 02:02:00

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 06/10] fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero

Punching a hole in a reflinked file needs dax_copy_edge() too. Otherwise,
data in the non-aligned area will be incorrect. So, add the srcmap to
dax_iomap_zero() and replace memset() with dax_copy_edge().

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 9 +++++++--
fs/iomap/buffered-io.c | 2 +-
include/linux/dax.h | 3 ++-
3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index cfe513eb111e..348297b38f76 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1174,7 +1174,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */

-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
+ struct iomap *srcmap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff;
@@ -1204,7 +1205,11 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
}

if (!page_aligned) {
- memset(kaddr + offset, 0, size);
+ if (iomap->addr != srcmap->addr)
+ dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
+ kaddr, true);
+ else
+ memset(kaddr + offset, 0, size);
dax_flush(iomap->dax_dev, kaddr + offset, size);
}
dax_read_unlock(id);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..d754b1f1a05d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
s64 bytes;

if (IS_DAX(inode))
- bytes = dax_iomap_zero(pos, length, iomap);
+ bytes = dax_iomap_zero(pos, length, iomap, srcmap);
else
bytes = iomap_zero(inode, pos, length, iomap, srcmap);
if (bytes < 0)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..3275e01ed33d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
+ struct iomap *srcmap);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
--
2.30.1



2021-03-19 02:03:18

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 08/10] fsdax: Dedup file range to use a compare function

With dax we cannot deal with readpage() etc. So, we create a dax
comparison function which is similar to
vfs_dedupe_file_range_compare().
And introduce dax_remap_file_range_prep() for filesystem use.

Signed-off-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 56 ++++++++++++++++++++++++++++++++++++++++++++
fs/remap_range.c | 45 ++++++++++++++++++++++++++++-------
fs/xfs/xfs_reflink.c | 9 +++++--
include/linux/dax.h | 4 ++++
include/linux/fs.h | 15 ++++++++----
5 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 348297b38f76..76f81f1d76ec 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1833,3 +1833,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
+ struct inode *ino2, loff_t pos2, loff_t len, void *data,
+ struct iomap *smap, struct iomap *dmap)
+{
+ void *saddr, *daddr;
+ bool *same = data;
+ int ret;
+
+ if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+ *same = true;
+ return len;
+ }
+
+ if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+ *same = false;
+ return 0;
+ }
+
+ ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+ &saddr, NULL);
+ if (ret < 0)
+ return -EIO;
+
+ ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+ &daddr, NULL);
+ if (ret < 0)
+ return -EIO;
+
+ *same = !memcmp(saddr, daddr, len);
+ return len;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
+ const struct iomap_ops *ops)
+{
+ int id, ret = 0;
+
+ id = dax_read_lock();
+ while (len) {
+ ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
+ is_same, dax_range_compare_actor);
+ if (ret < 0 || !*is_same)
+ goto out;
+
+ len -= ret;
+ srcoff += ret;
+ destoff += ret;
+ }
+ ret = 0;
+out:
+ dax_read_unlock(id);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 77dba3a49e65..9079390edaf3 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include "internal.h"

#include <linux/uaccess.h>
@@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
+int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
{
loff_t src_poff;
loff_t dest_poff;
@@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
out_error:
return error;
}
+EXPORT_SYMBOL(vfs_dedupe_file_range_compare);

/*
* Check that the two inodes are eligible for cloning, the ranges make
@@ -289,9 +291,11 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+static int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -351,8 +355,15 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;

- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ if (!IS_DAX(inode_in) && !IS_DAX(inode_out))
+ ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same);
+ else if (IS_DAX(inode_in) && IS_DAX(inode_out) && ops)
+ ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same,
+ ops);
+ else
+ return -EINVAL;
if (ret)
return ret;
if (!is_same)
@@ -370,6 +381,24 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,

return ret;
}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL(dax_remap_file_range_prep);
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, NULL);
+}
EXPORT_SYMBOL(generic_remap_file_range_prep);

loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 6fa05fb78189..f5b3a3da36b7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1308,8 +1308,13 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;

- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ if (IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags,
+ &xfs_read_iomap_ops);
if (ret || *len == 0)
goto out_unlock;

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 3275e01ed33d..32e1c34349f2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
struct iomap *srcmap);
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same,
+ const struct iomap_ops *ops);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..2e6ec5bdf82a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -68,6 +68,7 @@ struct fsverity_info;
struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;
+struct iomap_ops;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -1910,13 +1911,19 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
+typedef int (*compare_range_t)(struct inode *src, loff_t srcpos,
+ struct inode *dest, loff_t destpos,
+ loff_t len, bool *is_same);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags);
-extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *count,
- unsigned int remap_flags);
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *count, unsigned int remap_flags);
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
--
2.30.1



2021-03-19 02:03:40

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 07/10] iomap: Introduce iomap_apply2() for operations on two files

Some operations, such as comparing a range of data in two files under
fsdax mode, require nested iomap_begin()/iomap_end() calls on two files.
Thus, we introduce iomap_apply2() to accept arguments from two files and
iomap_actor2_t for actions on two files.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/iomap/apply.c | 56 +++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 7 +++++-
2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 26ab6563181f..fbc38ce3d5b6 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -97,3 +97,59 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,

return written ? written : ret;
}
+
+loff_t
+iomap_apply2(struct inode *ino1, loff_t pos1, struct inode *ino2, loff_t pos2,
+ loff_t length, unsigned int flags, const struct iomap_ops *ops,
+ void *data, iomap_actor2_t actor)
+{
+ struct iomap smap = { .type = IOMAP_HOLE };
+ struct iomap dmap = { .type = IOMAP_HOLE };
+ loff_t written = 0, ret, ret2 = 0;
+ loff_t len1 = length, len2, min_len;
+
+ ret = ops->iomap_begin(ino1, pos1, len1, flags, &smap, NULL);
+ if (ret)
+ goto out_src;
+ if (WARN_ON(smap.offset > pos1)) {
+ written = -EIO;
+ goto out_src;
+ }
+ if (WARN_ON(smap.length == 0)) {
+ written = -EIO;
+ goto out_src;
+ }
+ len2 = min_t(loff_t, len1, smap.length);
+
+ ret = ops->iomap_begin(ino2, pos2, len2, flags, &dmap, NULL);
+ if (ret)
+ goto out_dest;
+ if (WARN_ON(dmap.offset > pos2)) {
+ written = -EIO;
+ goto out_dest;
+ }
+ if (WARN_ON(dmap.length == 0)) {
+ written = -EIO;
+ goto out_dest;
+ }
+ min_len = min_t(loff_t, len2, dmap.length);
+
+ written = actor(ino1, pos1, ino2, pos2, min_len, data, &smap, &dmap);
+
+out_dest:
+ if (ops->iomap_end)
+ ret2 = ops->iomap_end(ino2, pos2, len2,
+ written > 0 ? written : 0, flags, &dmap);
+out_src:
+ if (ops->iomap_end)
+ ret = ops->iomap_end(ino1, pos1, len1,
+ written > 0 ? written : 0, flags, &smap);
+
+ if (ret)
+ return written ? written : ret;
+
+ if (ret2)
+ return written ? written : ret2;
+
+ return written;
+}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5bd3cac4df9c..913f98897a77 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -148,10 +148,15 @@ struct iomap_ops {
*/
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
void *data, struct iomap *iomap, struct iomap *srcmap);
-
+typedef loff_t (*iomap_actor2_t)(struct inode *ino1, loff_t pos1,
+ struct inode *ino2, loff_t pos2, loff_t len, void *data,
+ struct iomap *smap, struct iomap *dmap);
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, const struct iomap_ops *ops, void *data,
iomap_actor_t actor);
+loff_t iomap_apply2(struct inode *ino1, loff_t pos1, struct inode *ino2,
+ loff_t pos2, loff_t length, unsigned int flags,
+ const struct iomap_ops *ops, void *data, iomap_actor2_t actor);

ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
--
2.30.1



2021-03-19 02:03:47

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 05/10] fsdax: Replace mmap entry in case of CoW

We replace the existing entry with the newly allocated one in case of CoW.
Also, we mark the entry as PAGECACHE_TAG_TOWRITE so writeback marks this
entry as writeprotected. This helps us implement snapshots, so that new
write pagefaults after a snapshot trigger a CoW.

Signed-off-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 37 ++++++++++++++++++++++++++-----------
1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 181aad97136a..cfe513eb111e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -722,6 +722,9 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
return 0;
}

+#define DAX_IF_DIRTY (1 << 0)
+#define DAX_IF_COW (1 << 1)
+
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -729,16 +732,19 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_entry(struct xa_state *xas,
- struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags,
+ unsigned int insert_flags)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = insert_flags & DAX_IF_DIRTY;
+ bool cow = insert_flags & DAX_IF_COW;

if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -750,7 +756,7 @@ static void *dax_insert_entry(struct xa_state *xas,

xas_reset(xas);
xas_lock_irq(xas);
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;

dax_disassociate_entry(entry, mapping, false);
@@ -774,6 +780,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
@@ -1098,8 +1107,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;

- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, *entry, pfn, DAX_ZERO_PAGE, 0);

ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1126,8 +1134,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;

pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, 0);

if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1431,6 +1439,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
+ unsigned int insert_flags = 0;
int err = 0;
pfn_t pfn;
void *kaddr;
@@ -1453,8 +1462,14 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
if (err)
return dax_fault_return(err);

- entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
- write && !sync);
+ if (write) {
+ if (!sync)
+ insert_flags |= DAX_IF_DIRTY;
+ if (iomap->flags & IOMAP_F_SHARED)
+ insert_flags |= DAX_IF_COW;
+ }
+
+ entry = dax_insert_entry(xas, vmf, entry, pfn, 0, insert_flags);

if (write && srcmap->addr != iomap->addr) {
err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr, false);
--
2.30.1



2021-03-19 02:04:56

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 10/10] fs/xfs: Add dedupe support for fsdax

Add xfs_break_two_dax_layouts() to break layouts for two dax files. Then
call the range compare function only when both files are DAX (or neither is).

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_file.c | 20 ++++++++++++++++++++
fs/xfs/xfs_inode.c | 8 +++++++-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_reflink.c | 5 +++--
4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1987d15eab61..82467d08e3ce 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -784,6 +784,26 @@ xfs_break_dax_layouts(
0, 0, xfs_wait_dax_page(inode));
}

+int
+xfs_break_two_dax_layouts(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+ bool retry = false;
+
+retry:
+ error = xfs_break_dax_layouts(src, &retry);
+ if (error || retry)
+ goto retry;
+
+ error = xfs_break_dax_layouts(dest, &retry);
+ if (error || retry)
+ goto retry;
+
+ return error;
+}
+
int
xfs_break_layouts(
struct inode *inode,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b7352bc4c815..c11b11e59a83 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3651,8 +3651,10 @@ xfs_ilock2_io_mmap(
struct xfs_inode *ip2)
{
int ret;
+ struct inode *inode1 = VFS_I(ip1);
+ struct inode *inode2 = VFS_I(ip2);

- ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ ret = xfs_iolock_two_inodes_and_break_layout(inode1, inode2);
if (ret)
return ret;
if (ip1 == ip2)
@@ -3660,6 +3662,10 @@ xfs_ilock2_io_mmap(
else
xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
ip2, XFS_MMAPLOCK_EXCL);
+
+ if (IS_DAX(inode1) && IS_DAX(inode2))
+ ret = xfs_break_two_dax_layouts(inode1, inode2);
+
return 0;
}

diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eca333f5f715..9ed7a2895602 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -431,6 +431,7 @@ enum xfs_prealloc_flags {

int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
+int xfs_break_two_dax_layouts(struct inode *inode1, struct inode *inode2);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 73c556c2ff76..e62b00c2a0c8 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -29,6 +29,7 @@
#include "xfs_iomap.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"
+#include <linux/dax.h>

/*
* Copy on Write of Shared Blocks
@@ -1303,8 +1304,8 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;

- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;

if (IS_DAX(inode_in))
--
2.30.1



2021-03-19 02:05:31

by Shiyang Ruan

[permalink] [raw]
Subject: [PATCH v3 09/10] fs/xfs: Handle CoW for fsdax write() path

In fsdax mode, WRITE and ZERO on a shared extent need CoW to be performed.
After CoW, the newly allocated extents need to be remapped to the file. So,
add an iomap_end for dax write ops to do the remapping work.

Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_bmap_util.c | 3 +--
fs/xfs/xfs_file.c | 9 +++----
fs/xfs/xfs_iomap.c | 58 +++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 4 +++
fs/xfs/xfs_iops.c | 7 +++--
fs/xfs/xfs_reflink.c | 3 +--
6 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 7371a7f7c652..809013de9915 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -976,8 +976,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_zero_range(VFS_I(ip), offset, len, NULL);
if (error)
return error;

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f73837..1987d15eab61 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -623,11 +623,8 @@ xfs_file_dax_write(
count = iov_iter_count(from);

trace_xfs_file_dax_write(ip, count, pos);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
- if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
- i_size_write(inode, iocb->ki_pos);
- error = xfs_setfilesize(ip, pos, ret);
- }
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
+
out:
xfs_iunlock(ip, iolock);
if (error)
@@ -1250,7 +1247,7 @@ __xfs_filemap_fault(

ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
+ &xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b9ff824e82d..0afae5dbbce1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -771,7 +771,8 @@ xfs_direct_write_iomap_begin(

/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ flags & IOMAP_DIRECT || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -850,6 +851,38 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};

+static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ int error = 0;
+ xfs_inode_t *ip = XFS_I(inode);
+ bool cow = xfs_is_cow_inode(ip);
+
+ if (pos + written > i_size_read(inode)) {
+ i_size_write(inode, pos + written);
+ error = xfs_setfilesize(ip, pos, written);
+ if (error && cow) {
+ xfs_reflink_cancel_cow_range(ip, pos, written, true);
+ return error;
+ }
+ }
+ if (cow)
+ error = xfs_reflink_end_cow(ip, pos, written);
+
+ return error;
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1308,3 +1341,26 @@ xfs_xattr_iomap_begin(
const struct iomap_ops xfs_xattr_iomap_ops = {
.iomap_begin = xfs_xattr_iomap_begin,
};
+
+int
+xfs_iomap_zero_range(
+ struct inode *inode,
+ loff_t offset,
+ loff_t len,
+ bool *did_zero)
+{
+ return iomap_zero_range(inode, offset, len, did_zero,
+ IS_DAX(inode) ? &xfs_dax_write_iomap_ops :
+ &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_iomap_truncate_page(
+ struct inode *inode,
+ loff_t pos,
+ bool *did_zero)
+{
+ return iomap_truncate_page(inode, pos, did_zero,
+ IS_DAX(inode) ? &xfs_dax_write_iomap_ops :
+ &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..8adb2bf78a5a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -14,6 +14,9 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+int xfs_iomap_zero_range(struct inode *inode, loff_t offset, loff_t len,
+ bool *did_zero);
+int xfs_iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);

@@ -42,6 +45,7 @@ xfs_aligned_fsb_count(

extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 67c8dc9de8aa..2281161e05c8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -889,8 +889,8 @@ xfs_setattr_size(
*/
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
- error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_zero_range(inode, oldsize, newsize - oldsize,
+ &did_zeroing);
} else {
/*
* iomap won't detect a dirty page over an unwritten block (or a
@@ -902,8 +902,7 @@ xfs_setattr_size(
newsize);
if (error)
return error;
- error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_truncate_page(inode, newsize, &did_zeroing);
}

if (error)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5b3a3da36b7..73c556c2ff76 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1245,8 +1245,7 @@ xfs_reflink_zero_posteof(
return 0;

trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL);
}

/*
--
2.30.1



2021-03-23 15:32:55

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax



On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> From: Shiyang Ruan <[email protected]>
>
> This patchset is attempt to add CoW support for fsdax, and take XFS,
> which has both reflink and fsdax feature, as an example.


Thanks for the patchset. I have tried reviewing the series for logical
correctness and, to some extent, functional correctness.
Since I am not well versed in the core functionality of the COW
operation, I may have requested some clarifications where I could not
fully understand what the code is doing.


>
> (Rebased on v5.11)
> ==
>
Thanks. Yes, I see some conflicts when tried to apply on latest kernel.

-ritesh

2021-03-23 15:56:36

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 03/10] fsdax: Output address in dax_iomap_pfn() and rename it



On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> Add address output in dax_iomap_pfn() in order to perform a memcpy() in
> CoW case. Since this function both output address and pfn, rename it to
> dax_iomap_direct_access().
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 16 ++++++++++++----
> 1 file changed, 12 insertions(+), 4 deletions(-)
>

Like the naming convention. It is consistent with
dax_direct_access() function.

Changes looks good to me. Feel free to add.

Reviewed-by: Ritesh Harjani <[email protected]>

2021-03-23 16:03:19

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor()



On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> The core logic in the two dax page fault functions is similar. So, move
> the logic into a common helper function. Also, to facilitate the
> addition of new features, such as CoW, switch-case is no longer used to
> handle different iomap types.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/dax.c | 291 +++++++++++++++++++++++++++----------------------------
> 1 file changed, 145 insertions(+), 146 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 7031e4302b13..33ddad0f3091 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1053,6 +1053,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
> return ret;
> }
>
> +#ifdef CONFIG_FS_DAX_PMD
> +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> + struct iomap *iomap, void **entry)
> +{
> + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> + unsigned long pmd_addr = vmf->address & PMD_MASK;
> + struct vm_area_struct *vma = vmf->vma;
> + struct inode *inode = mapping->host;
> + pgtable_t pgtable = NULL;
> + struct page *zero_page;
> + spinlock_t *ptl;
> + pmd_t pmd_entry;
> + pfn_t pfn;
> +
> + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
> +
> + if (unlikely(!zero_page))
> + goto fallback;
> +
> + pfn = page_to_pfn_t(zero_page);
> + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> + DAX_PMD | DAX_ZERO_PAGE, false);
> +
> + if (arch_needs_pgtable_deposit()) {
> + pgtable = pte_alloc_one(vma->vm_mm);
> + if (!pgtable)
> + return VM_FAULT_OOM;
> + }
> +
> + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
> + if (!pmd_none(*(vmf->pmd))) {
> + spin_unlock(ptl);
> + goto fallback;
> + }
> +
> + if (pgtable) {
> + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> + mm_inc_nr_ptes(vma->vm_mm);
> + }
> + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
> + pmd_entry = pmd_mkhuge(pmd_entry);
> + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
> + spin_unlock(ptl);
> + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
> + return VM_FAULT_NOPAGE;
> +
> +fallback:
> + if (pgtable)
> + pte_free(vma->vm_mm, pgtable);
> + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
> + return VM_FAULT_FALLBACK;
> +}
> +#else
> +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> + struct iomap *iomap, void **entry)
> +{
> + return VM_FAULT_FALLBACK;
> +}
> +#endif /* CONFIG_FS_DAX_PMD */
> +
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> @@ -1289,6 +1349,61 @@ static int dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap,
> return 0;
> }
>
> +/**
> + * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault.
> + * @vmf: vm fault instance
> + * @pfnp: pfn to be returned
> + * @xas: the dax mapping tree of a file
> + * @entry: an unlocked dax entry to be inserted
> + * @pmd: distinguish whether it is a pmd fault
> + * @flags: iomap flags
> + * @iomap: from iomap_begin()
> + * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW
> + */
> +static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
> + struct xa_state *xas, void *entry, bool pmd, unsigned int flags,
> + struct iomap *iomap, struct iomap *srcmap)
> +{
> + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> + size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
> + loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;

shouldn't we use xa_index here for pos ?
(loff_t)xas->xa_index << PAGE_SHIFT;

> + bool write = vmf->flags & FAULT_FLAG_WRITE;
> + bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
> + int err = 0;
> + pfn_t pfn;
> +
> + /* if we are reading UNWRITTEN and HOLE, return a hole. */
> + if (!write &&
> + (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
> + if (!pmd)
> + return dax_load_hole(xas, mapping, &entry, vmf);
> + else
> + return dax_pmd_load_hole(xas, vmf, iomap, &entry);
> + }
> +
> + if (iomap->type != IOMAP_MAPPED) {
> + WARN_ON_ONCE(1);
> + return VM_FAULT_SIGBUS;
> + }

So now, in case the mapping is not mapped, we always return
VM_FAULT_SIGBUS. But earlier we were only doing WARN_ON_ONCE(1).
Can you please help explain why the change in behavior?




> +
> + err = dax_iomap_pfn(iomap, pos, size, &pfn);
> + if (err)
> + return dax_fault_return(err);

Same case here as well. This could return SIGBUS while earlier I am not
sure why were we only returning FALLBACK?


> +
> + entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
> + write && !sync);

In dax_insert_entry() we are passing 0 as flags.
We should be passing DAX_PMD/DAX_PTE no?


> +
> + if (sync)
> + return dax_fault_synchronous_pfnp(pfnp, pfn);
> +


/* handle PMD case here */
> + if (pmd)
> + return vmf_insert_pfn_pmd(vmf, pfn, write);

/* handle PTE case here */
> + if (write)
> + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
> + else
> + return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> +}

It is easy to miss the return from if(pmd) case while reading.
A comment like above could be helpful for code review.


> +
> static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> int *iomap_errp, const struct iomap_ops *ops)
> {
> @@ -1296,17 +1411,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> struct address_space *mapping = vma->vm_file->f_mapping;
> XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
> struct inode *inode = mapping->host;
> - unsigned long vaddr = vmf->address;
> loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
> struct iomap iomap = { .type = IOMAP_HOLE };
> struct iomap srcmap = { .type = IOMAP_HOLE };
> unsigned flags = IOMAP_FAULT;
> int error, major = 0;
> bool write = vmf->flags & FAULT_FLAG_WRITE;
> - bool sync;
> vm_fault_t ret = 0;
> void *entry;
> - pfn_t pfn;
>
> trace_dax_pte_fault(inode, vmf, ret);
> /*
> @@ -1352,8 +1464,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> goto unlock_entry;
> }
> if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
> - error = -EIO; /* fs corruption? */
> - goto error_finish_iomap;
> + ret = VM_FAULT_SIGBUS; /* fs corruption? */
> + goto finish_iomap;
> }
>
> if (vmf->cow_page) {
> @@ -1363,49 +1475,19 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> goto finish_iomap;
> }
>
> - sync = dax_fault_is_synchronous(flags, vma, &iomap);
> -
> - switch (iomap.type) {
> - case IOMAP_MAPPED:
> - if (iomap.flags & IOMAP_F_NEW) {
> - count_vm_event(PGMAJFAULT);
> - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> - major = VM_FAULT_MAJOR;
> - }
> - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
> - if (error < 0)
> - goto error_finish_iomap;
> -
> - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
> - 0, write && !sync);
> -
> - if (sync) {
> - ret = dax_fault_synchronous_pfnp(pfnp, pfn);
> - goto finish_iomap;
> - }
> - trace_dax_insert_mapping(inode, vmf, entry);
> - if (write)
> - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
> - else
> - ret = vmf_insert_mixed(vma, vaddr, pfn);
> -
> + ret = dax_fault_actor(vmf, pfnp, &xas, entry, false, flags,
> + &iomap, &srcmap);
> + if (ret == VM_FAULT_SIGBUS)
> goto finish_iomap;
> - case IOMAP_UNWRITTEN:
> - case IOMAP_HOLE:
> - if (!write) {
> - ret = dax_load_hole(&xas, mapping, &entry, vmf);
> - goto finish_iomap;
> - }
> - fallthrough;
> - default:
> - WARN_ON_ONCE(1);
> - error = -EIO;
> - break;
> +
> + /* read/write MAPPED, CoW UNWRITTEN */
> + if (iomap.flags & IOMAP_F_NEW) {
> + count_vm_event(PGMAJFAULT);
> + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> + major = VM_FAULT_MAJOR;
> }

It would be much better if the above accounting were also done inside the
dax_fault_actor() function itself. Then, at the end of this function, we
would just need to do "return ret" instead of "return ret | major".


>
> - error_finish_iomap:
> - ret = dax_fault_return(error);
> - finish_iomap:
> +finish_iomap:
> if (ops->iomap_end) {
> int copied = PAGE_SIZE;
>
> @@ -1419,66 +1501,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> */
> ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
> }
> - unlock_entry:
> +unlock_entry:
> dax_unlock_entry(&xas, entry);
> - out:
> +out:
> trace_dax_pte_fault_done(inode, vmf, ret);
> return ret | major;
> }


-ritesh

2021-03-24 03:36:06

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 01/10] fsdax: Factor helpers to simplify dax fault code



On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> The dax page fault code is too long and a bit difficult to read. And it
> is hard to understand when we trying to add new features. Some of the
> PTE/PMD codes have similar logic. So, factor them as helper functions to
> simplify the code.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 152 ++++++++++++++++++++++++++++++-------------------------
> 1 file changed, 84 insertions(+), 68 deletions(-)
>

The refactoring & the changes look good to me.
Feel free to add.

Reviewed-by: Ritesh Harjani <[email protected]>

2021-03-24 05:43:59

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 04/10] fsdax: Introduce dax_iomap_cow_copy()



On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> In the case where the iomap is a write operation and iomap is not equal
> to srcmap after iomap_begin, we consider it is a CoW operation.
>
> The destance extent which iomap indicated is new allocated extent.
> So, it is needed to copy the data from srcmap to new allocated extent.
> In theory, it is better to copy the head and tail ranges which is
> outside of the non-aligned area instead of copying the whole aligned
> range. But in dax page fault, it will always be an aligned range. So,
> we have to copy the whole range in this case.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 66 insertions(+), 5 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index a70e6aa285bb..181aad97136a 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1037,6 +1037,51 @@ static int dax_iomap_direct_access(struct iomap *iomap, loff_t pos, size_t size,
> return rc;
> }
>
> +/*
> + * Copy the head and tail part of the pages not included in the write but
> + * required for CoW, because pos/pos+length are not page aligned. But in dax
> + * page fault case, the range is page aligned, we need to copy the whole range
> + * of data. Use copy_edge to distinguish these cases.
> + */


Is this version better? Feel free to update/change.

dax_iomap_cow_copy(): This can be called from two places.
Either during DAX write fault, to copy the length size data to daddr.
Or, while doing normal DAX write operation, dax_iomap_actor() might call
this to do the copy of either start or end unaligned address. In this
case the rest of the copy of aligned ranges is taken care by
dax_iomap_actor() itself.
Also, note DAX fault will always result in aligned pos and pos + length.

* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
* @srcmap: iomap srcmap
* @daddr: destination address to copy to.

> +static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t align_size,
> + struct iomap *srcmap, void *daddr, bool copy_edge)

do we need bool copy_edge here?
We can detect the non-aligned case directly, no? i.e. if head_off != 0 or
pg_end != end.


> +{
> + loff_t head_off = pos & (align_size - 1);
> + size_t size = ALIGN(head_off + length, align_size);
> + loff_t end = pos + length;
> + loff_t pg_end = round_up(end, align_size);
> + void *saddr = 0;
> + int ret = 0;
> +
> + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> + if (ret)
> + return ret;
> +
> + if (!copy_edge)
> + return copy_mc_to_kernel(daddr, saddr, length);
> +
> + /* Copy the head part of the range. Note: we pass offset as length. */
> + if (head_off) {
> + if (saddr)
> + ret = copy_mc_to_kernel(daddr, saddr, head_off);
> + else
> + memset(daddr, 0, head_off);
> + }
> + /* Copy the tail part of the range */
> + if (end < pg_end) {
> + loff_t tail_off = head_off + length;
> + loff_t tail_len = pg_end - end;
> +
> + if (saddr)
> + ret = copy_mc_to_kernel(daddr + tail_off,
> + saddr + tail_off, tail_len);
> + else
> + memset(daddr + tail_off, 0, tail_len);
> + }

Can you please help me understand in which case saddr is 0, causing us to
fall back to the memset API?
Also, I was thinking: shouldn't such restrictions be coded inside the
copy_mc_to_kernel() function in general?


-ritesh

2021-03-31 04:05:57

by Shiyang Ruan

[permalink] [raw]
Subject: RE: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor()



> -----Original Message-----
> From: Ritesh Harjani <[email protected]>
> Sent: Tuesday, March 23, 2021 11:48 PM
> Subject: Re: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor()
>
>
>
> On 3/19/21 7:22 AM, Shiyang Ruan wrote:
> > The core logic in the two dax page fault functions is similar. So,
> > move the logic into a common helper function. Also, to facilitate the
> > addition of new features, such as CoW, switch-case is no longer used
> > to handle different iomap types.
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/dax.c | 291 +++++++++++++++++++++++++++----------------------------
> > 1 file changed, 145 insertions(+), 146 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 7031e4302b13..33ddad0f3091 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1053,6 +1053,66 @@ static vm_fault_t dax_load_hole(struct xa_state
> *xas,
> > return ret;
> > }
> >
> > +#ifdef CONFIG_FS_DAX_PMD
> > +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault
> *vmf,
> > + struct iomap *iomap, void **entry)
> > +{
> > + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> > + unsigned long pmd_addr = vmf->address & PMD_MASK;
> > + struct vm_area_struct *vma = vmf->vma;
> > + struct inode *inode = mapping->host;
> > + pgtable_t pgtable = NULL;
> > + struct page *zero_page;
> > + spinlock_t *ptl;
> > + pmd_t pmd_entry;
> > + pfn_t pfn;
> > +
> > + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
> > +
> > + if (unlikely(!zero_page))
> > + goto fallback;
> > +
> > + pfn = page_to_pfn_t(zero_page);
> > + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> > + DAX_PMD | DAX_ZERO_PAGE, false);
> > +
> > + if (arch_needs_pgtable_deposit()) {
> > + pgtable = pte_alloc_one(vma->vm_mm);
> > + if (!pgtable)
> > + return VM_FAULT_OOM;
> > + }
> > +
> > + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
> > + if (!pmd_none(*(vmf->pmd))) {
> > + spin_unlock(ptl);
> > + goto fallback;
> > + }
> > +
> > + if (pgtable) {
> > + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> > + mm_inc_nr_ptes(vma->vm_mm);
> > + }
> > + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
> > + pmd_entry = pmd_mkhuge(pmd_entry);
> > + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
> > + spin_unlock(ptl);
> > + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
> > + return VM_FAULT_NOPAGE;
> > +
> > +fallback:
> > + if (pgtable)
> > + pte_free(vma->vm_mm, pgtable);
> > + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
> > + return VM_FAULT_FALLBACK;
> > +}
> > +#else
> > +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault
> *vmf,
> > + struct iomap *iomap, void **entry)
> > +{
> > + return VM_FAULT_FALLBACK;
> > +}
> > +#endif /* CONFIG_FS_DAX_PMD */
> > +
> > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > {
> > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); @@
> -1289,6
> > +1349,61 @@ static int dax_fault_cow_page(struct vm_fault *vmf, struct
> iomap *iomap,
> > return 0;
> > }
> >
> > +/**
> > + * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault.
> > + * @vmf: vm fault instance
> > + * @pfnp: pfn to be returned
> > + * @xas: the dax mapping tree of a file
> > + * @entry: an unlocked dax entry to be inserted
> > + * @pmd: distinguish whether it is a pmd fault
> > + * @flags: iomap flags
> > + * @iomap: from iomap_begin()
> > + * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW
> > + */
> > +static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
> > + struct xa_state *xas, void *entry, bool pmd, unsigned int flags,
> > + struct iomap *iomap, struct iomap *srcmap) {
> > + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> > + size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
> > + loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;
>
> shouldn't we use xa_index here for pos ?
> (loff_t)xas->xa_index << PAGE_SHIFT;
Yes.
>
> > + bool write = vmf->flags & FAULT_FLAG_WRITE;
> > + bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
> > + int err = 0;
> > + pfn_t pfn;
> > +
> > + /* if we are reading UNWRITTEN and HOLE, return a hole. */
> > + if (!write &&
> > + (iomap->type == IOMAP_UNWRITTEN || iomap->type ==
> IOMAP_HOLE)) {
> > + if (!pmd)
> > + return dax_load_hole(xas, mapping, &entry, vmf);
> > + else
> > + return dax_pmd_load_hole(xas, vmf, iomap, &entry);
> > + }
> > +
> > + if (iomap->type != IOMAP_MAPPED) {
> > + WARN_ON_ONCE(1);
> > + return VM_FAULT_SIGBUS;
> > + }
>
> So now in case if mapping is not mapped, we always cause VM_FAULT_SIGBUG.
> But earlier we were only doing WARN_ON_ONCE(1).
> Can you pls help answer why the change in behavior?
>

The behavior in the PTE fault path was to always check the error code via dax_fault_return(error) after WARN_ON_ONCE(1). So, I moved the dax_fault_return() into dax_fault_actor(). But I just found that, in the PMD fault path, this check was not done. So, I think I should move dax_fault_return() outside of dax_fault_actor() to keep the previous logic.

>
>
>
> > +
> > + err = dax_iomap_pfn(iomap, pos, size, &pfn);
> > + if (err)
> > + return dax_fault_return(err);
>
> Same case here as well. This could return SIGBUS while earlier I am not sure
> why were we only returning FALLBACK?
>
Yes. Thanks for pointing out.
>
> > +
> > + entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
> > + write && !sync);
>
> In dax_insert_entry() we are passing 0 as flags.
> We should be passing DAX_PMD/DAX_PTE no?
>
My mistake.
>
> > +
> > + if (sync)
> > + return dax_fault_synchronous_pfnp(pfnp, pfn);
> > +
>
>
> /* handle PMD case here */
> > + if (pmd)
> > + return vmf_insert_pfn_pmd(vmf, pfn, write);
>
> /* handle PTE case here */
> > + if (write)
> > + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
> > + else
> > + return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> > +}
>
> It is easy to miss the return from if(pmd) case while reading.
> A comment like above could be helpful for code review.
>
>
> > +
> > static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> > int *iomap_errp, const struct iomap_ops *ops)
> > {
> > @@ -1296,17 +1411,14 @@ static vm_fault_t dax_iomap_pte_fault(struct
> vm_fault *vmf, pfn_t *pfnp,
> > struct address_space *mapping = vma->vm_file->f_mapping;
> > XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
> > struct inode *inode = mapping->host;
> > - unsigned long vaddr = vmf->address;
> > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
> > struct iomap iomap = { .type = IOMAP_HOLE };
> > struct iomap srcmap = { .type = IOMAP_HOLE };
> > unsigned flags = IOMAP_FAULT;
> > int error, major = 0;
> > bool write = vmf->flags & FAULT_FLAG_WRITE;
> > - bool sync;
> > vm_fault_t ret = 0;
> > void *entry;
> > - pfn_t pfn;
> >
> > trace_dax_pte_fault(inode, vmf, ret);
> > /*
> > @@ -1352,8 +1464,8 @@ static vm_fault_t dax_iomap_pte_fault(struct
> vm_fault *vmf, pfn_t *pfnp,
> > goto unlock_entry;
> > }
> > if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE))
> {
> > - error = -EIO; /* fs corruption? */
> > - goto error_finish_iomap;
> > + ret = VM_FAULT_SIGBUS; /* fs corruption? */
> > + goto finish_iomap;
> > }
> >
> > if (vmf->cow_page) {
> > @@ -1363,49 +1475,19 @@ static vm_fault_t dax_iomap_pte_fault(struct
> vm_fault *vmf, pfn_t *pfnp,
> > goto finish_iomap;
> > }
> >
> > - sync = dax_fault_is_synchronous(flags, vma, &iomap);
> > -
> > - switch (iomap.type) {
> > - case IOMAP_MAPPED:
> > - if (iomap.flags & IOMAP_F_NEW) {
> > - count_vm_event(PGMAJFAULT);
> > - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > - major = VM_FAULT_MAJOR;
> > - }
> > - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
> > - if (error < 0)
> > - goto error_finish_iomap;
> > -
> > - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
> > - 0, write && !sync);
> > -
> > - if (sync) {
> > - ret = dax_fault_synchronous_pfnp(pfnp, pfn);
> > - goto finish_iomap;
> > - }
> > - trace_dax_insert_mapping(inode, vmf, entry);
> > - if (write)
> > - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
> > - else
> > - ret = vmf_insert_mixed(vma, vaddr, pfn);
> > -
> > + ret = dax_fault_actor(vmf, pfnp, &xas, entry, false, flags,
> > + &iomap, &srcmap);
> > + if (ret == VM_FAULT_SIGBUS)
> > goto finish_iomap;
> > - case IOMAP_UNWRITTEN:
> > - case IOMAP_HOLE:
> > - if (!write) {
> > - ret = dax_load_hole(&xas, mapping, &entry, vmf);
> > - goto finish_iomap;
> > - }
> > - fallthrough;
> > - default:
> > - WARN_ON_ONCE(1);
> > - error = -EIO;
> > - break;
> > +
> > + /* read/write MAPPED, CoW UNWRITTEN */
> > + if (iomap.flags & IOMAP_F_NEW) {
> > + count_vm_event(PGMAJFAULT);
> > + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
> > + major = VM_FAULT_MAJOR;
> > }
>
> It is much better if above accounting is also done in dax_fault_actor()
> function itself. Then at the end of this function we need to just do
> "return ret" instead of "return ret | major"
>
Yes.


--
Thanks,
Ruan Shiyang.
>
> >
> > - error_finish_iomap:
> > - ret = dax_fault_return(error);
> > - finish_iomap:
> > +finish_iomap:
> > if (ops->iomap_end) {
> > int copied = PAGE_SIZE;
> >
> > @@ -1419,66 +1501,14 @@ static vm_fault_t dax_iomap_pte_fault(struct
> vm_fault *vmf, pfn_t *pfnp,
> > */
> > ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
> > }
> > - unlock_entry:
> > +unlock_entry:
> > dax_unlock_entry(&xas, entry);
> > - out:
> > +out:
> > trace_dax_pte_fault_done(inode, vmf, ret);
> > return ret | major;
> > }
>
>
> -ritesh

2021-04-01 06:43:34

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 05/10] fsdax: Replace mmap entry in case of CoW

On 21/03/19 09:52AM, Shiyang Ruan wrote:
> We replace the existing entry to the newly allocated one in case of CoW.
> Also, we mark the entry as PAGECACHE_TAG_TOWRITE so writeback marks this
> entry as writeprotected. This helps us snapshots so new write
> pagefaults after snapshots trigger a CoW.
>

Please correct me here. So the flow is like this.
1. In the case of CoW on a reflinked file, if a write is attempted on an mmapped
   file, then in the DAX fault handler code, ->iomap_begin() on a given filesystem will
populate iomap and srcmap. srcmap being from where the read needs to be
attempted from and iomap on where the new write should go to.
2. So the dax_insert_entry() code as part of the fault handling will take care
of removing the old entry and inserting the new pfn entry to xas and mark
it with PAGECACHE_TAG_TOWRITE so that dax writeback can mark the entry as
write protected.
Is my above understanding correct?

> Signed-off-by: Goldwyn Rodrigues <[email protected]>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 37 ++++++++++++++++++++++++++-----------
> 1 file changed, 26 insertions(+), 11 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 181aad97136a..cfe513eb111e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -722,6 +722,9 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
> return 0;
> }
>
> +#define DAX_IF_DIRTY (1 << 0)
> +#define DAX_IF_COW (1 << 1)
> +
>
A small comment explaining that these are the DAX insert flags used in dax_insert_entry() would be helpful.

>
> /*
> * By this point grab_mapping_entry() has ensured that we have a locked entry
> * of the appropriate size so we don't have to worry about downgrading PMDs to
> @@ -729,16 +732,19 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
> * already in the tree, we will skip the insertion and just dirty the PMD as
> * appropriate.
> */
> -static void *dax_insert_entry(struct xa_state *xas,
> - struct address_space *mapping, struct vm_fault *vmf,
> - void *entry, pfn_t pfn, unsigned long flags, bool dirty)
> +static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
> + void *entry, pfn_t pfn, unsigned long flags,
> + unsigned int insert_flags)
> {
> + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> void *new_entry = dax_make_entry(pfn, flags);
> + bool dirty = insert_flags & DAX_IF_DIRTY;
> + bool cow = insert_flags & DAX_IF_COW;
>
> if (dirty)
> __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
>
> - if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
> + if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
> unsigned long index = xas->xa_index;
> /* we are replacing a zero page with block mapping */
> if (dax_is_pmd_entry(entry))
> @@ -750,7 +756,7 @@ static void *dax_insert_entry(struct xa_state *xas,
>
> xas_reset(xas);
> xas_lock_irq(xas);
> - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> + if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> void *old;
>
> dax_disassociate_entry(entry, mapping, false);
> @@ -774,6 +780,9 @@ static void *dax_insert_entry(struct xa_state *xas,
> if (dirty)
> xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
>
> + if (cow)
> + xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
> +
> xas_unlock_irq(xas);
> return entry;
> }
> @@ -1098,8 +1107,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
> pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
> vm_fault_t ret;
>
> - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> - DAX_ZERO_PAGE, false);
> + *entry = dax_insert_entry(xas, vmf, *entry, pfn, DAX_ZERO_PAGE, 0);
>
> ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> trace_dax_load_hole(inode, vmf, ret);
> @@ -1126,8 +1134,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> goto fallback;
>
> pfn = page_to_pfn_t(zero_page);
> - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> - DAX_PMD | DAX_ZERO_PAGE, false);
> + *entry = dax_insert_entry(xas, vmf, *entry, pfn,
> + DAX_PMD | DAX_ZERO_PAGE, 0);
>
> if (arch_needs_pgtable_deposit()) {
> pgtable = pte_alloc_one(vma->vm_mm);
> @@ -1431,6 +1439,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
> loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;
> bool write = vmf->flags & FAULT_FLAG_WRITE;
> bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
> + unsigned int insert_flags = 0;
> int err = 0;
> pfn_t pfn;
> void *kaddr;
> @@ -1453,8 +1462,14 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
> if (err)
> return dax_fault_return(err);
>
> - entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
> - write && !sync);
> + if (write) {
> + if (!sync)
> + insert_flags |= DAX_IF_DIRTY;
> + if (iomap->flags & IOMAP_F_SHARED)
> + insert_flags |= DAX_IF_COW;
> + }
> +
> + entry = dax_insert_entry(xas, vmf, entry, pfn, 0, insert_flags);
>
> if (write && srcmap->addr != iomap->addr) {
> err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr, false);
>

Rest looks good to me. Please feel free to add
Reviewed-by: Ritesh Harjani <[email protected]>

sorry about changing my email in the middle of this code review.
I am planning to use the above gmail id as my primary account for all upstream
work from now on.

> --
> 2.30.1
>
>
>

2021-04-01 06:46:36

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 06/10] fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero

On 21/03/19 09:52AM, Shiyang Ruan wrote:
> Punch hole on a reflinked file needs dax_copy_edge() too. Otherwise,
> data in not aligned area will be not correct. So, add the srcmap to
> dax_iomap_zero() and replace memset() as dax_copy_edge().
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/dax.c | 9 +++++++--
> fs/iomap/buffered-io.c | 2 +-
> include/linux/dax.h | 3 ++-
> 3 files changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index cfe513eb111e..348297b38f76 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1174,7 +1174,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> }
> #endif /* CONFIG_FS_DAX_PMD */
>
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> + struct iomap *srcmap)

Do we know why does dax_iomap_zero() operates on PAGE_SIZE range?
IIUC, dax_zero_page_range() can take nr_pages as a parameter. But we still
always use one page at a time. Why is that?

> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> pgoff_t pgoff;
> @@ -1204,7 +1205,11 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> }
>
> if (!page_aligned) {
> - memset(kaddr + offset, 0, size);
> + if (iomap->addr != srcmap->addr)
> + dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> + kaddr, true);
> + else
> + memset(kaddr + offset, 0, size);
> dax_flush(iomap->dax_dev, kaddr + offset, size);
> }
> dax_read_unlock(id);
>

Maybe the above could be simplified to this?

if (page_aligned) {
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
} else {
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
if (iomap->addr != srcmap->addr)
dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
kaddr, true);
else
memset(kaddr + offset, 0, size);
dax_flush(iomap->dax_dev, kaddr + offset, size);
}

dax_read_unlock(id);
return rc < 0 ? rc : size;

Other than that looks good.
Feel free to add.
Reviewed-by: Ritesh Harjani <[email protected]>


2021-04-01 07:03:15

by Shiyang Ruan

[permalink] [raw]
Subject: RE: [PATCH v3 06/10] fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero



> -----Original Message-----
> From: Ritesh Harjani <[email protected]>
> Sent: Thursday, April 1, 2021 2:45 PM
> Subject: Re: [PATCH v3 06/10] fsdax: Add dax_iomap_cow_copy() for
> dax_iomap_zero
>
> On 21/03/19 09:52AM, Shiyang Ruan wrote:
> > Punch hole on a reflinked file needs dax_copy_edge() too. Otherwise,
> > data in not aligned area will be not correct. So, add the srcmap to
> > dax_iomap_zero() and replace memset() as dax_copy_edge().
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/dax.c | 9 +++++++--
> > fs/iomap/buffered-io.c | 2 +-
> > include/linux/dax.h | 3 ++-
> > 3 files changed, 10 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index cfe513eb111e..348297b38f76 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1174,7 +1174,8 @@ static vm_fault_t dax_pmd_load_hole(struct
> > xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD
> > */
> >
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > + struct iomap *srcmap)
>
> Do we know why does dax_iomap_zero() operates on PAGE_SIZE range?
> IIUC, dax_zero_page_range() can take nr_pages as a parameter. But we still
> always use one page at a time. Why is that?

I think we can handle more than one page here. The length can be more than one PAGE_SIZE.

>
> > {
> > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> > pgoff_t pgoff;
> > @@ -1204,7 +1205,11 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct
> iomap *iomap)
> > }
> >
> > if (!page_aligned) {
> > - memset(kaddr + offset, 0, size);
> > + if (iomap->addr != srcmap->addr)
> > + dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> > + kaddr, true);
> > + else
> > + memset(kaddr + offset, 0, size);
> > dax_flush(iomap->dax_dev, kaddr + offset, size);
> > }
> > dax_read_unlock(id);
> >
>
> Maybe the above could be simplified to this?
>
> if (page_aligned) {
> rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> } else {
> rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);

This looks good. We need to add a check for rc here.

> if (iomap->addr != srcmap->addr)
> dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> kaddr, true);
> else
> memset(kaddr + offset, 0, size);
> dax_flush(iomap->dax_dev, kaddr + offset, size);
> }
>
> dax_read_unlock(id);
> return rc < 0 ? rc : size;
>
> Other than that looks good.
> Feel free to add.
> Reviewed-by: Ritesh Harjani <[email protected]>
>

--
Thanks,
Ruan Shiyang.

2021-04-01 07:07:42

by Shiyang Ruan

[permalink] [raw]
Subject: RE: [PATCH v3 05/10] fsdax: Replace mmap entry in case of CoW



> -----Original Message-----
> From: Ritesh Harjani <[email protected]>
> Sent: Thursday, April 1, 2021 2:40 PM
> Subject: Re: [PATCH v3 05/10] fsdax: Replace mmap entry in case of CoW
>
> On 21/03/19 09:52AM, Shiyang Ruan wrote:
> > We replace the existing entry to the newly allocated one in case of CoW.
> > Also, we mark the entry as PAGECACHE_TAG_TOWRITE so writeback marks
> > this entry as writeprotected. This helps us snapshots so new write
> > pagefaults after snapshots trigger a CoW.
> >
>
> Please correct me here. So the flow is like this.
> 1. In case of CoW or a reflinked file, on an mmaped file if write is attempted,
> Then in DAX fault handler code, ->iomap_begin() on a given filesystem will
> populate iomap and srcmap. srcmap being from where the read needs to be
> attempted from and iomap on where the new write should go to.
> 2. So the dax_insert_entry() code as part of the fault handling will take care
> of removing the old entry and inserting the new pfn entry to xas and mark
> it with PAGECACHE_TAG_TOWRITE so that dax writeback can mark the
> entry as
> write protected.
> Is my above understanding correct?

Yes, you are right.

>
> > Signed-off-by: Goldwyn Rodrigues <[email protected]>
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > Reviewed-by: Christoph Hellwig <[email protected]>
> > ---
> > fs/dax.c | 37 ++++++++++++++++++++++++++-----------
> > 1 file changed, 26 insertions(+), 11 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 181aad97136a..cfe513eb111e 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -722,6 +722,9 @@ static int copy_cow_page_dax(struct block_device
> *bdev, struct dax_device *dax_d
> > return 0;
> > }
> >
> > +#define DAX_IF_DIRTY (1 << 0)
> > +#define DAX_IF_COW (1 << 1)
> > +
> >
> small comment expalining this means DAX insert flags used in dax_insert_entry()

OK. I'll add it.
>
> >
> > /*
> > * By this point grab_mapping_entry() has ensured that we have a locked
> entry
> > * of the appropriate size so we don't have to worry about
> > downgrading PMDs to @@ -729,16 +732,19 @@ static int
> copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
> > * already in the tree, we will skip the insertion and just dirty the PMD as
> > * appropriate.
> > */
> > -static void *dax_insert_entry(struct xa_state *xas,
> > - struct address_space *mapping, struct vm_fault *vmf,
> > - void *entry, pfn_t pfn, unsigned long flags, bool dirty)
> > +static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
> > + void *entry, pfn_t pfn, unsigned long flags,
> > + unsigned int insert_flags)
> > {
> > + struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> > void *new_entry = dax_make_entry(pfn, flags);
> > + bool dirty = insert_flags & DAX_IF_DIRTY;
> > + bool cow = insert_flags & DAX_IF_COW;
> >
> > if (dirty)
> > __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> >
> > - if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
> > + if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
> > unsigned long index = xas->xa_index;
> > /* we are replacing a zero page with block mapping */
> > if (dax_is_pmd_entry(entry))
> > @@ -750,7 +756,7 @@ static void *dax_insert_entry(struct xa_state
> > *xas,
> >
> > xas_reset(xas);
> > xas_lock_irq(xas);
> > - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> > + if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
> > void *old;
> >
> > dax_disassociate_entry(entry, mapping, false); @@ -774,6 +780,9
> @@
> > static void *dax_insert_entry(struct xa_state *xas,
> > if (dirty)
> > xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
> >
> > + if (cow)
> > + xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
> > +
> > xas_unlock_irq(xas);
> > return entry;
> > }
> > @@ -1098,8 +1107,7 @@ static vm_fault_t dax_load_hole(struct xa_state
> *xas,
> > pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
> > vm_fault_t ret;
> >
> > - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> > - DAX_ZERO_PAGE, false);
> > + *entry = dax_insert_entry(xas, vmf, *entry, pfn, DAX_ZERO_PAGE, 0);
> >
> > ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> > trace_dax_load_hole(inode, vmf, ret); @@ -1126,8 +1134,8 @@ static
> > vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> > goto fallback;
> >
> > pfn = page_to_pfn_t(zero_page);
> > - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
> > - DAX_PMD | DAX_ZERO_PAGE, false);
> > + *entry = dax_insert_entry(xas, vmf, *entry, pfn,
> > + DAX_PMD | DAX_ZERO_PAGE, 0);
> >
> > if (arch_needs_pgtable_deposit()) {
> > pgtable = pte_alloc_one(vma->vm_mm); @@ -1431,6 +1439,7 @@
> static
> > vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
> > loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT;
> > bool write = vmf->flags & FAULT_FLAG_WRITE;
> > bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
> > + unsigned int insert_flags = 0;
> > int err = 0;
> > pfn_t pfn;
> > void *kaddr;
> > @@ -1453,8 +1462,14 @@ static vm_fault_t dax_fault_actor(struct vm_fault
> *vmf, pfn_t *pfnp,
> > if (err)
> > return dax_fault_return(err);
> >
> > - entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0,
> > - write && !sync);
> > + if (write) {
> > + if (!sync)
> > + insert_flags |= DAX_IF_DIRTY;
> > + if (iomap->flags & IOMAP_F_SHARED)
> > + insert_flags |= DAX_IF_COW;
> > + }
> > +
> > + entry = dax_insert_entry(xas, vmf, entry, pfn, 0, insert_flags);
> >
> > if (write && srcmap->addr != iomap->addr) {
> > err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr, false);
> >
>
> Rest looks good to me. Please feel free to add
> Reviewed-by: Ritesh Harjani <[email protected]>
>
> sorry about changing my email in between of this code review.
> I am planning to use above gmail id as primary account for all upstream work
> from now.
>

--
Thanks,
Ruan Shiyang.

> > --
> > 2.30.1
> >
> >
> >

2021-04-01 07:15:43

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 07/10] iomap: Introduce iomap_apply2() for operations on two files

On 21/03/19 09:52AM, Shiyang Ruan wrote:
> Some operations, such as comparing a range of data in two files under
> fsdax mode, requires nested iomap_open()/iomap_end() on two file. Thus,
> we introduce iomap_apply2() to accept arguments from two files and
> iomap_actor2_t for actions on two files.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/iomap/apply.c | 56 +++++++++++++++++++++++++++++++++++++++++++
> include/linux/iomap.h | 7 +++++-
> 2 files changed, 62 insertions(+), 1 deletion(-)
>
> diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
> index 26ab6563181f..fbc38ce3d5b6 100644
> --- a/fs/iomap/apply.c
> +++ b/fs/iomap/apply.c
> @@ -97,3 +97,59 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
>
> return written ? written : ret;
> }
> +
> +loff_t
> +iomap_apply2(struct inode *ino1, loff_t pos1, struct inode *ino2, loff_t pos2,
> + loff_t length, unsigned int flags, const struct iomap_ops *ops,
> + void *data, iomap_actor2_t actor)
> +{
> + struct iomap smap = { .type = IOMAP_HOLE };
> + struct iomap dmap = { .type = IOMAP_HOLE };
> + loff_t written = 0, ret, ret2 = 0;
> + loff_t len1 = length, len2, min_len;
> +
> + ret = ops->iomap_begin(ino1, pos1, len1, flags, &smap, NULL);
> + if (ret)
> + goto out_src;

If the above call fails, we need not call ->iomap_end() on smap.

> + if (WARN_ON(smap.offset > pos1)) {
> + written = -EIO;
> + goto out_src;
> + }
> + if (WARN_ON(smap.length == 0)) {
> + written = -EIO;
> + goto out_src;
> + }
> + len2 = min_t(loff_t, len1, smap.length);
> +
> + ret = ops->iomap_begin(ino2, pos2, len2, flags, &dmap, NULL);
> + if (ret)
> + goto out_dest;

ditto

> + if (WARN_ON(dmap.offset > pos2)) {
> + written = -EIO;
> + goto out_dest;
> + }
> + if (WARN_ON(dmap.length == 0)) {
> + written = -EIO;
> + goto out_dest;
> + }
> + min_len = min_t(loff_t, len2, dmap.length);
> +
> + written = actor(ino1, pos1, ino2, pos2, min_len, data, &smap, &dmap);
> +
> +out_dest:
> + if (ops->iomap_end)
> + ret2 = ops->iomap_end(ino2, pos2, len2,
> + written > 0 ? written : 0, flags, &dmap);
> +out_src:
> + if (ops->iomap_end)
> + ret = ops->iomap_end(ino1, pos1, len1,
> + written > 0 ? written : 0, flags, &smap);
> +

I guess this may not be a problem, but I still think we should be
consistent w.r.t. the len argument we are passing in ->iomap_end() for both
types of the iomap_apply* family of functions.
IIUC, we used to call ->iomap_end() with the length argument filled by the
filesystem from ->iomap_begin() call.

whereas the above breaks that behavior. Although I don't think this is FATAL,
it is still better to be consistent with the APIs.
Thoughts?


> + if (ret)
> + return written ? written : ret;
> +
> + if (ret2)
> + return written ? written : ret2;
> +
> + return written;
> +}

if (written)
return written;

return ret ? ret : ret2;

Is above a simpler version?

-ritesh

2021-04-01 11:13:28

by Ritesh Harjani

[permalink] [raw]
Subject: Re: [PATCH v3 08/10] fsdax: Dedup file range to use a compare function

On 21/03/19 09:52AM, Shiyang Ruan wrote:
> With dax we cannot deal with readpage() etc. So, we create a dax
> comparison function which is similar to
> vfs_dedupe_file_range_compare().
> And introduce dax_remap_file_range_prep() for filesystem use.
>
> Signed-off-by: Goldwyn Rodrigues <[email protected]>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/dax.c | 56 ++++++++++++++++++++++++++++++++++++++++++++
> fs/remap_range.c | 45 ++++++++++++++++++++++++++++-------
> fs/xfs/xfs_reflink.c | 9 +++++--
> include/linux/dax.h | 4 ++++
> include/linux/fs.h | 15 ++++++++----
> 5 files changed, 115 insertions(+), 14 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 348297b38f76..76f81f1d76ec 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1833,3 +1833,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> return dax_insert_pfn_mkwrite(vmf, pfn, order);
> }
> EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
> +
> +static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
> + struct inode *ino2, loff_t pos2, loff_t len, void *data,
> + struct iomap *smap, struct iomap *dmap)
> +{
> + void *saddr, *daddr;
> + bool *same = data;
> + int ret;
> +
> + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
> + *same = true;
> + return len;
> + }
> +
> + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
> + *same = false;
> + return 0;
> + }
> +
> + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
> + &saddr, NULL);

shouldn't it take len as the second argument?

> + if (ret < 0)
> + return -EIO;
> +
> + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
> + &daddr, NULL);

ditto.
> + if (ret < 0)
> + return -EIO;
> +
> + *same = !memcmp(saddr, daddr, len);
> + return len;
> +}
> +
> +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
> + const struct iomap_ops *ops)
> +{
> + int id, ret = 0;
> +
> + id = dax_read_lock();
> + while (len) {
> + ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
> + is_same, dax_range_compare_actor);
> + if (ret < 0 || !*is_same)
> + goto out;
> +
> + len -= ret;
> + srcoff += ret;
> + destoff += ret;
> + }
> + ret = 0;
> +out:
> + dax_read_unlock(id);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
> diff --git a/fs/remap_range.c b/fs/remap_range.c
> index 77dba3a49e65..9079390edaf3 100644
> --- a/fs/remap_range.c
> +++ b/fs/remap_range.c
> @@ -14,6 +14,7 @@
> #include <linux/compat.h>
> #include <linux/mount.h>
> #include <linux/fs.h>
> +#include <linux/dax.h>
> #include "internal.h"
>
> #include <linux/uaccess.h>
> @@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
> * Compare extents of two files to see if they are the same.
> * Caller must have locked both inodes to prevent write races.
> */
> -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> - struct inode *dest, loff_t destoff,
> - loff_t len, bool *is_same)
> +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff,
> + loff_t len, bool *is_same)
> {
> loff_t src_poff;
> loff_t dest_poff;
> @@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> out_error:
> return error;
> }
> +EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
>
> /*
> * Check that the two inodes are eligible for cloning, the ranges make
> @@ -289,9 +291,11 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> * If there's an error, then the usual negative error code is returned.
> * Otherwise returns 0 with *len set to the request length.
> */
> -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> - struct file *file_out, loff_t pos_out,
> - loff_t *len, unsigned int remap_flags)
> +static int
> +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops)
> {
> struct inode *inode_in = file_inode(file_in);
> struct inode *inode_out = file_inode(file_out);
> @@ -351,8 +355,15 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> if (remap_flags & REMAP_FILE_DEDUP) {
> bool is_same = false;
>
> - ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> - inode_out, pos_out, *len, &is_same);
> + if (!IS_DAX(inode_in) && !IS_DAX(inode_out))
> + ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> + inode_out, pos_out, *len, &is_same);
> + else if (IS_DAX(inode_in) && IS_DAX(inode_out) && ops)
> + ret = dax_dedupe_file_range_compare(inode_in, pos_in,
> + inode_out, pos_out, *len, &is_same,
> + ops);
> + else
> + return -EINVAL;
> if (ret)
> return ret;
> if (!is_same)

should we consider checking !is_same before this?
You should maybe take another look at the error handling side of this code.
We still return len from the actor function, but is_same is set to false.
So we are essentially returning ret (a positive value); instead we should be
returning -EBADE because of !memcmp

> @@ -370,6 +381,24 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
>
> return ret;
> }
> +
> +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops)
> +{
> + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags, ops);
> +}
> +EXPORT_SYMBOL(dax_remap_file_range_prep);
> +
> +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags)
> +{
> + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags, NULL);
> +}
> EXPORT_SYMBOL(generic_remap_file_range_prep);
>
> loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 6fa05fb78189..f5b3a3da36b7 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1308,8 +1308,13 @@ xfs_reflink_remap_prep(
> if (IS_DAX(inode_in) || IS_DAX(inode_out))
> goto out_unlock;
>
> - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
> - len, remap_flags);
> + if (IS_DAX(inode_in))

if (!IS_DAX(inode_in)) no?

> + ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags);
> + else
> + ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags,
> + &xfs_read_iomap_ops);
> if (ret || *len == 0)
> goto out_unlock;
>
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 3275e01ed33d..32e1c34349f2 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> struct iomap *srcmap);
> +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff,
> + loff_t len, bool *is_same,
> + const struct iomap_ops *ops);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd47deea7c17..2e6ec5bdf82a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -68,6 +68,7 @@ struct fsverity_info;
> struct fsverity_operations;
> struct fs_context;
> struct fs_parameter_spec;
> +struct iomap_ops;
>
> extern void __init inode_init(void);
> extern void __init inode_init_early(void);
> @@ -1910,13 +1911,19 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
> extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> loff_t, size_t, unsigned int);
> +typedef int (*compare_range_t)(struct inode *src, loff_t srcpos,
> + struct inode *dest, loff_t destpos,
> + loff_t len, bool *is_same);

Is this used anywhere?

> extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> struct file *file_out, loff_t pos_out,
> size_t len, unsigned int flags);
> -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> - struct file *file_out, loff_t pos_out,
> - loff_t *count,
> - unsigned int remap_flags);
> +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *count, unsigned int remap_flags);
> +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops);
> extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> struct file *file_out, loff_t pos_out,
> loff_t len, unsigned int remap_flags);
> --
> 2.30.1
>
>
>

2021-04-02 07:48:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor()

> + if (!pmd)
> + return dax_load_hole(xas, mapping, &entry, vmf);
> + else
> + return dax_pmd_load_hole(xas, vmf, iomap, &entry);

> + if (pmd)
> + return vmf_insert_pfn_pmd(vmf, pfn, write);
> + if (write)
> + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
> + else
> + return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> +}

No need for else statements after returning.

Otherwise looks good:

Reviewed-by: Christoph Hellwig <[email protected]>

2021-04-02 07:51:32

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax

Shiyang, Dan:

given that the whole reflink+dax thing is going to take a while and thus
not going to happen for this merge window, what about queueing up the
cleanup patches 1,2 and 3 so that we can reduce the patch load a little?

2021-04-02 08:20:24

by Shiyang Ruan

[permalink] [raw]
Subject: RE: [PATCH v3 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax



> -----Original Message-----
> From: Christoph Hellwig <[email protected]>
> Sent: Friday, April 2, 2021 3:50 PM
> Subject: Re: [PATCH v3 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax
>
> Shiyang, Dan:
>
> given that the whole reflink+dax thing is going to take a while and thus not going
> to happen for this merge window, what about queueing up the cleanup patches
> 1,2 and 3 so that we can reduce the patch load a little?

OK. I'll send a new version of these 3 patches based on latest comment.


--
Thanks,
Ruan Shiyang.

2021-04-08 03:23:25

by Shiyang Ruan

[permalink] [raw]
Subject: RE: [PATCH v3 08/10] fsdax: Dedup file range to use a compare function


> -----Original Message-----
> From: Ritesh Harjani <[email protected]>
> Subject: Re: [PATCH v3 08/10] fsdax: Dedup file range to use a compare function
>
> On 21/03/19 09:52AM, Shiyang Ruan wrote:
> > With dax we cannot deal with readpage() etc. So, we create a dax
> > comparison function which is similar to
> > vfs_dedupe_file_range_compare().
> > And introduce dax_remap_file_range_prep() for filesystem use.
> >
> > Signed-off-by: Goldwyn Rodrigues <[email protected]>
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/dax.c | 56
> ++++++++++++++++++++++++++++++++++++++++++++
> > fs/remap_range.c | 45 ++++++++++++++++++++++++++++-------
> > fs/xfs/xfs_reflink.c | 9 +++++--
> > include/linux/dax.h | 4 ++++
> > include/linux/fs.h | 15 ++++++++----
> > 5 files changed, 115 insertions(+), 14 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 348297b38f76..76f81f1d76ec 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1833,3 +1833,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault
> *vmf,
> > return dax_insert_pfn_mkwrite(vmf, pfn, order); }
> > EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
> > +
> > +static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
> > + struct inode *ino2, loff_t pos2, loff_t len, void *data,
> > + struct iomap *smap, struct iomap *dmap) {
> > + void *saddr, *daddr;
> > + bool *same = data;
> > + int ret;
> > +
> > + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
> > + *same = true;
> > + return len;
> > + }
> > +
> > + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
> > + *same = false;
> > + return 0;
> > + }
> > +
> > + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
> > + &saddr, NULL);
>
> shouldn't it take len as the second argument?

The second argument of dax_iomap_direct_access() is the offset, and the third one is the length. So, I think this is right.

>
> > + if (ret < 0)
> > + return -EIO;
> > +
> > + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
> > + &daddr, NULL);
>
> ditto.
> > + if (ret < 0)
> > + return -EIO;
> > +
> > + *same = !memcmp(saddr, daddr, len);
> > + return len;
> > +}
> > +
> > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
> > + const struct iomap_ops *ops)
> > +{
> > + int id, ret = 0;
> > +
> > + id = dax_read_lock();
> > + while (len) {
> > + ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
> > + is_same, dax_range_compare_actor);
> > + if (ret < 0 || !*is_same)
> > + goto out;
> > +
> > + len -= ret;
> > + srcoff += ret;
> > + destoff += ret;
> > + }
> > + ret = 0;
> > +out:
> > + dax_read_unlock(id);
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
> > diff --git a/fs/remap_range.c b/fs/remap_range.c index
> > 77dba3a49e65..9079390edaf3 100644
> > --- a/fs/remap_range.c
> > +++ b/fs/remap_range.c
> > @@ -14,6 +14,7 @@
> > #include <linux/compat.h>
> > #include <linux/mount.h>
> > #include <linux/fs.h>
> > +#include <linux/dax.h>
> > #include "internal.h"
> >
> > #include <linux/uaccess.h>
> > @@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1,
> struct page *page2)
> > * Compare extents of two files to see if they are the same.
> > * Caller must have locked both inodes to prevent write races.
> > */
> > -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > - struct inode *dest, loff_t destoff,
> > - loff_t len, bool *is_same)
> > +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff,
> > + loff_t len, bool *is_same)
> > {
> > loff_t src_poff;
> > loff_t dest_poff;
> > @@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct
> > inode *src, loff_t srcoff,
> > out_error:
> > return error;
> > }
> > +EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
> >
> > /*
> > * Check that the two inodes are eligible for cloning, the ranges
> > make @@ -289,9 +291,11 @@ static int
> vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > * If there's an error, then the usual negative error code is returned.
> > * Otherwise returns 0 with *len set to the request length.
> > */
> > -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > - struct file *file_out, loff_t pos_out,
> > - loff_t *len, unsigned int remap_flags)
> > +static int
> > +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *ops)
> > {
> > struct inode *inode_in = file_inode(file_in);
> > struct inode *inode_out = file_inode(file_out); @@ -351,8 +355,15 @@
> > int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > if (remap_flags & REMAP_FILE_DEDUP) {
> > bool is_same = false;
> >
> > - ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > - inode_out, pos_out, *len, &is_same);
> > + if (!IS_DAX(inode_in) && !IS_DAX(inode_out))
> > + ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > + inode_out, pos_out, *len, &is_same);
> > + else if (IS_DAX(inode_in) && IS_DAX(inode_out) && ops)
> > + ret = dax_dedupe_file_range_compare(inode_in, pos_in,
> > + inode_out, pos_out, *len, &is_same,
> > + ops);
> > + else
> > + return -EINVAL;
> > if (ret)
> > return ret;
> > if (!is_same)
>
> should we consider checking !is_same before this?
> You should maybe take another look at the error handling side of this code.
> We still return len from the actor function, but is_same is set to false.
> So we are essentially returning ret (a positive value); instead we should be returning
> -EBADE because of !memcmp

The ret from the compare function will not be positive; it will always be 0 or negative.
In addition, if something goes wrong, is_same will always be false. The caller could not get the correct errno if it checked !is_same first.

>
> > @@ -370,6 +381,24 @@ int generic_remap_file_range_prep(struct file
> > *file_in, loff_t pos_in,
> >
> > return ret;
> > }
> > +
> > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *ops) {
> > + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags, ops); }
> > +EXPORT_SYMBOL(dax_remap_file_range_prep);
> > +
> > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags) {
> > + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags, NULL); }
> > EXPORT_SYMBOL(generic_remap_file_range_prep);
> >
> > loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff
> > --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index
> > 6fa05fb78189..f5b3a3da36b7 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -1308,8 +1308,13 @@ xfs_reflink_remap_prep(
> > if (IS_DAX(inode_in) || IS_DAX(inode_out))
> > goto out_unlock;
> >
> > - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
> > - len, remap_flags);
> > + if (IS_DAX(inode_in))
>
> if (!IS_DAX(inode_in)) no?
My mistake...
>
> > + ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags);
> > + else
> > + ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags,
> > + &xfs_read_iomap_ops);
> > if (ret || *len == 0)
> > goto out_unlock;
> >
> > diff --git a/include/linux/dax.h b/include/linux/dax.h index
> > 3275e01ed33d..32e1c34349f2 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct
> address_space *mapping,
> > pgoff_t index);
> > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > struct iomap *srcmap);
> > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff,
> > + loff_t len, bool *is_same,
> > + const struct iomap_ops *ops);
> > static inline bool dax_mapping(struct address_space *mapping) {
> > return mapping->host && IS_DAX(mapping->host); diff --git
> > a/include/linux/fs.h b/include/linux/fs.h index
> > fd47deea7c17..2e6ec5bdf82a 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -68,6 +68,7 @@ struct fsverity_info; struct fsverity_operations;
> > struct fs_context; struct fs_parameter_spec;
> > +struct iomap_ops;
> >
> > extern void __init inode_init(void);
> > extern void __init inode_init_early(void); @@ -1910,13 +1911,19 @@
> > extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t
> > *); extern ssize_t vfs_write(struct file *, const char __user *,
> > size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct
> file *,
> > loff_t, size_t, unsigned int);
> > +typedef int (*compare_range_t)(struct inode *src, loff_t srcpos,
> > + struct inode *dest, loff_t destpos,
> > + loff_t len, bool *is_same);
>
> Is this used anywhere?

No, I forgot to remove it...


--
Thanks,
Ruan Shiyang.
>
> > extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > struct file *file_out, loff_t pos_out,
> > size_t len, unsigned int flags); -extern int
> > generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > - struct file *file_out, loff_t pos_out,
> > - loff_t *count,
> > - unsigned int remap_flags);
> > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *count, unsigned int remap_flags); int
> > +dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *ops);
> > extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> > struct file *file_out, loff_t pos_out,
> > loff_t len, unsigned int remap_flags);
> > --
> > 2.30.1
> >
> >
> >