This patchset is an attempt to add CoW support for fsdax, taking XFS,
which has both the reflink and fsdax features, as an example.
Changes from V4:
- Fix a mistake in breaking the dax layout for two inodes
- Add a CONFIG_FS_DAX check for the fsdax code in remap_range.c
- Fix other small problems and mistakes
Changes from V3:
- Take out the first 3 patches as a cleanup patchset[1], which was
  sent yesterday.
- Fix usage of code in dax_iomap_cow_copy()
- Add comments for macro definitions
- Fix other code style problems and mistakes
One of the key mechanisms that needs to be implemented in fsdax is CoW:
copy the data from the srcmap before we actually write to the
destination iomap, and copy only the ranges in which the data won't be
changed.
Another mechanism is range comparison. In the page cache case,
readpage() is used to load on-disk data into the page cache so that it
can be compared. In the fsdax case, readpage() does not work, so we
need another way to compare data, one with direct-access support.
With the two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.
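As a concrete illustration of what this enables (my example, not part
of the series; the paths are hypothetical), userspace can then reflink
a file on a DAX-mounted XFS with the regular FICLONE ioctl:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FICLONE */

int main(void)
{
	int src = open("/mnt/pmem/a", O_RDONLY);
	int dst = open("/mnt/pmem/b", O_WRONLY | O_CREAT, 0644);

	if (src < 0 || dst < 0)
		return 1;
	/* Share all of a's extents with b; writes to b then go CoW. */
	if (ioctl(dst, FICLONE, src) < 0) {
		perror("FICLONE");
		return 1;
	}
	return 0;
}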
Some of the patches are picked up from Goldwyn's patchset. I made some
changes to adapt them to this patchset.
(Rebased on v5.13-rc1 and patchset[1])
[1]: https://lkml.org/lkml/2021/4/22/575
Shiyang Ruan (7):
fsdax: Introduce dax_iomap_cow_copy()
fsdax: Replace mmap entry in case of CoW
fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero
iomap: Introduce iomap_apply2() for operations on two files
fsdax: Dedup file range to use a compare function
fs/xfs: Handle CoW for fsdax write() path
fs/xfs: Add dax dedupe support
fs/dax.c | 206 +++++++++++++++++++++++++++++++++++------
fs/iomap/apply.c | 52 +++++++++++
fs/iomap/buffered-io.c | 2 +-
fs/remap_range.c | 57 ++++++++++--
fs/xfs/xfs_bmap_util.c | 3 +-
fs/xfs/xfs_file.c | 11 +--
fs/xfs/xfs_inode.c | 66 ++++++++++++-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_iomap.c | 61 +++++++++++-
fs/xfs/xfs_iomap.h | 4 +
fs/xfs/xfs_iops.c | 7 +-
fs/xfs/xfs_reflink.c | 15 +--
include/linux/dax.h | 7 +-
include/linux/fs.h | 12 ++-
include/linux/iomap.h | 7 +-
15 files changed, 449 insertions(+), 62 deletions(-)
--
2.31.1
In the case where the iomap is a write operation and the iomap is not
equal to the srcmap after iomap_begin(), we consider it a CoW
operation.
The destination extent which the iomap indicates is a newly allocated
extent, so we need to copy the data from the srcmap to it.
In theory, it is better to copy only the head and tail ranges that fall
outside of the written area instead of copying the whole aligned range.
But a dax page fault always covers an aligned range, so we have to copy
the whole range in that case.
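For illustration only (not part of the patch), the edge arithmetic with
made-up values, as a standalone userspace program:

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	long long pos = 0x1100, length = 0x200, align = 0x1000;
	long long head_off = pos & (align - 1);		/* 0x100 */
	long long end = pos + length;			/* 0x1300 */
	long long pg_end = ALIGN_UP(end, align);	/* 0x2000 */

	printf("head copy: %#llx bytes\n", head_off);	  /* [page start, pos) */
	printf("tail copy: %#llx bytes\n", pg_end - end); /* [end, pg_end) */
	return 0;
}

Only these two edge ranges need the copy; the middle is overwritten
with the new data anyway.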
Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 81 insertions(+), 5 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index bf3fc8242e6c..f0249bb1d46a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1038,6 +1038,61 @@ static int dax_iomap_direct_access(struct iomap *iomap, loff_t pos, size_t size,
return rc;
}
+/**
+ * dax_iomap_cow_copy(): Copy the data from source to destination before write.
+ * @pos: address to do copy from.
+ * @length: size of copy operation.
+ * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
+ * @srcmap: iomap srcmap
+ * @daddr: destination address to copy to.
+ *
+ * This can be called from two places. Either during DAX write fault, to copy
+ * the length size data to daddr. Or, while doing normal DAX write operation,
+ * dax_iomap_actor() might call this to do the copy of either start or end
+ * unaligned address. In this case the rest of the copy of aligned ranges is
+ * taken care by dax_iomap_actor() itself.
+ * Also, note DAX fault will always result in aligned pos and pos + length.
+ */
+static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t align_size,
+ struct iomap *srcmap, void *daddr)
+{
+ loff_t head_off = pos & (align_size - 1);
+ size_t size = ALIGN(head_off + length, align_size);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, align_size);
+ bool copy_all = head_off == 0 && end == pg_end;
+ void *saddr = 0;
+ int ret = 0;
+
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+
+ if (copy_all) {
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ return ret ? -EIO : 0;
+ }
+
+ /* Copy the head part of the range. Note: we pass offset as length. */
+ if (head_off) {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
+
+ /* Copy the tail part of the range */
+ if (end < pg_end) {
+ loff_t tail_off = head_off + length;
+ loff_t tail_len = pg_end - end;
+
+ ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
+ tail_len);
+ if (ret)
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -1167,11 +1222,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct dax_device *dax_dev = iomap->dax_dev;
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
+ bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;
- if (iov_iter_rw(iter) == READ) {
+ if (!write) {
end = min(end, i_size_read(inode));
if (pos >= end)
return 0;
@@ -1180,7 +1236,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return iov_iter_zero(min(length, end - pos), iter);
}
- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ /*
+ * In DAX mode, we allow either pure overwrites of written extents, or
+ * writes to unwritten extents as part of a copy-on-write operation.
+ */
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ !(iomap->flags & IOMAP_F_SHARED)))
return -EIO;
/*
@@ -1219,6 +1280,13 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
break;
}
+ if (write && srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@@ -1230,7 +1298,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* validated via access_ok() in either vfs_read() or
* vfs_write(), depending on which operation we are doing.
*/
- if (iov_iter_rw(iter) == WRITE)
+ if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1382,6 +1450,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
+ void *kaddr;
/* if we are reading UNWRITTEN and HOLE, return a hole. */
if (!write &&
@@ -1392,18 +1461,25 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
return dax_pmd_load_hole(xas, vmf, iomap, entry);
}
- if (iomap->type != IOMAP_MAPPED) {
+ if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
- err = dax_iomap_direct_access(iomap, pos, size, NULL, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
write && !sync);
+ if (write &&
+ srcmap->addr != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (err)
+ return dax_fault_return(err);
+ }
+
if (sync)
return dax_fault_synchronous_pfnp(pfnp, pfn);
--
2.31.1
We replace the existing entry with the newly allocated one in case of
CoW. Also, we mark the entry as PAGECACHE_TAG_TOWRITE so that writeback
marks this entry as write-protected. This helps with snapshots: new
write page faults after a snapshot trigger a CoW.
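For context, writeback later finds such entries by walking the TOWRITE
tag; a minimal sketch of that walk (modelled on the existing
dax_writeback_mapping_range()/dax_writeback_one() pair, simplified
here):

XA_STATE(xas, &mapping->i_pages, start_index);
void *entry;

xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
	/* Flush and write-protect this entry so that the next write
	 * faults again and goes through the CoW path. */
	dax_writeback_one(&xas, dax_dev, mapping, entry);
}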
Signed-off-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Ritesh Harjani <[email protected]>
---
fs/dax.c | 39 ++++++++++++++++++++++++++++-----------
1 file changed, 28 insertions(+), 11 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index f0249bb1d46a..ef0e564e7904 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -722,6 +722,10 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
return 0;
}
+/* DAX Insert Flag: The state of the entry we insert */
+#define DAX_IF_DIRTY (1 << 0)
+#define DAX_IF_COW (1 << 1)
+
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -729,16 +733,19 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_entry(struct xa_state *xas,
- struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags,
+ unsigned int insert_flags)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = insert_flags & DAX_IF_DIRTY;
+ bool cow = insert_flags & DAX_IF_COW;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -750,7 +757,7 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
@@ -774,6 +781,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
@@ -1109,8 +1119,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, *entry, pfn, DAX_ZERO_PAGE, 0);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1137,8 +1146,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, 0);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1448,6 +1457,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap);
unsigned long entry_flags = pmd ? DAX_PMD : 0;
+ unsigned int insert_flags = 0;
int err = 0;
pfn_t pfn;
void *kaddr;
@@ -1470,8 +1480,15 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp,
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
- write && !sync);
+ if (write) {
+ if (!sync)
+ insert_flags |= DAX_IF_DIRTY;
+ if (iomap->flags & IOMAP_F_SHARED)
+ insert_flags |= DAX_IF_COW;
+ }
+
+ *entry = dax_insert_entry(xas, vmf, *entry, pfn, entry_flags,
+ insert_flags);
if (write &&
srcmap->addr != IOMAP_HOLE && srcmap->addr != iomap->addr) {
--
2.31.1
Punching a hole in a reflinked file needs dax_iomap_cow_copy() too;
otherwise, data in the non-aligned area will not be correct. So, add
the srcmap to dax_iomap_zero() and replace the memset() with
dax_iomap_cow_copy().
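An illustrative userspace trigger for this path (my example; the path
is hypothetical, and the hole is deliberately unaligned so the page
edges must be copied from the source extent before being zeroed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/pmem/b", O_RDWR);

	if (fd < 0)
		return 1;
	/* An unaligned 1 KiB hole, starting 512 bytes into page 1. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096 + 512, 1024) < 0)
		perror("fallocate");
	return 0;
}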
Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Ritesh Harjani <[email protected]>
---
fs/dax.c | 25 +++++++++++++++----------
fs/iomap/buffered-io.c | 2 +-
include/linux/dax.h | 3 ++-
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index ef0e564e7904..ee9d28a79bfb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
+ struct iomap *srcmap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff;
@@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
if (page_aligned)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
- else
+ else {
rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
-
- if (!page_aligned) {
- memset(kaddr + offset, 0, size);
+ if (rc < 0)
+ goto out;
+ if (iomap->addr != srcmap->addr) {
+ rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
+ kaddr);
+ if (rc < 0)
+ goto out;
+ } else
+ memset(kaddr + offset, 0, size);
dax_flush(iomap->dax_dev, kaddr + offset, size);
}
+
+out:
dax_read_unlock(id);
- return size;
+ return rc < 0 ? rc : size;
}
static loff_t
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f2cd2034a87b..2734955ea67f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
s64 bytes;
if (IS_DAX(inode))
- bytes = dax_iomap_zero(pos, length, iomap);
+ bytes = dax_iomap_zero(pos, length, iomap, srcmap);
else
bytes = iomap_zero(inode, pos, length, iomap, srcmap);
if (bytes < 0)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..3275e01ed33d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
+s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
+ struct iomap *srcmap);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
--
2.31.1
With dax we cannot deal with readpage() etc., so we create a dax
comparison function which is similar to vfs_dedupe_file_range_compare(),
and introduce dax_remap_file_range_prep() for filesystems to use.
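Userspace reaches this comparison through the normal dedupe ioctl; an
illustrative caller (my example, hypothetical paths):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIDEDUPERANGE */

int main(void)
{
	int src = open("/mnt/pmem/a", O_RDONLY);
	int dst = open("/mnt/pmem/b", O_RDWR);
	struct file_dedupe_range *arg =
		calloc(1, sizeof(*arg) +
			  sizeof(struct file_dedupe_range_info));

	if (src < 0 || dst < 0 || !arg)
		return 1;
	arg->src_length = 65536;
	arg->dest_count = 1;
	arg->info[0].dest_fd = dst;
	/* The kernel compares the two ranges (on DAX files via the new
	 * compare path) and shares them only if they are identical. */
	if (ioctl(src, FIDEDUPERANGE, arg) < 0)
		perror("FIDEDUPERANGE");
	else
		printf("deduped %llu bytes, status %d\n",
		       (unsigned long long)arg->info[0].bytes_deduped,
		       arg->info[0].status);
	return 0;
}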
Signed-off-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 56 +++++++++++++++++++++++++++++++++++++++++++
fs/remap_range.c | 57 +++++++++++++++++++++++++++++++++++++-------
fs/xfs/xfs_reflink.c | 8 +++++--
include/linux/dax.h | 4 ++++
include/linux/fs.h | 12 ++++++----
5 files changed, 123 insertions(+), 14 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index ee9d28a79bfb..dedf1be0155c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1853,3 +1853,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
+ struct inode *ino2, loff_t pos2, loff_t len, void *data,
+ struct iomap *smap, struct iomap *dmap)
+{
+ void *saddr, *daddr;
+ bool *same = data;
+ int ret;
+
+ if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+ *same = true;
+ return len;
+ }
+
+ if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+ *same = false;
+ return 0;
+ }
+
+ ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+ &saddr, NULL);
+ if (ret < 0)
+ return -EIO;
+
+ ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+ &daddr, NULL);
+ if (ret < 0)
+ return -EIO;
+
+ *same = !memcmp(saddr, daddr, len);
+ return len;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
+ const struct iomap_ops *ops)
+{
+ int id, ret = 0;
+
+ id = dax_read_lock();
+ while (len) {
+ ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
+ is_same, dax_range_compare_actor);
+ if (ret < 0 || !*is_same)
+ goto out;
+
+ len -= ret;
+ srcoff += ret;
+ destoff += ret;
+ }
+ ret = 0;
+out:
+ dax_read_unlock(id);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e4a5fdd7ad7b..7bc4c8e3aa9f 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include "internal.h"
#include <linux/uaccess.h>
@@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
+int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
{
loff_t src_poff;
loff_t dest_poff;
@@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
out_error:
return error;
}
+EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
/*
* Check that the two inodes are eligible for cloning, the ranges make
@@ -289,9 +291,11 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+static int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *dax_read_ops)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -351,8 +355,17 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ if (!IS_DAX(inode_in))
+ ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same);
+#ifdef CONFIG_FS_DAX
+ else if (dax_read_ops)
+ ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same,
+ dax_read_ops);
+#endif /* CONFIG_FS_DAX */
+ else
+ return -EINVAL;
if (ret)
return ret;
if (!is_same)
@@ -370,6 +383,34 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return ret;
}
+
+#ifdef CONFIG_FS_DAX
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, ops);
+}
+#else
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_FS_DAX */
+EXPORT_SYMBOL(dax_remap_file_range_prep);
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, NULL);
+}
EXPORT_SYMBOL(generic_remap_file_range_prep);
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 060695d6d56a..d25434f93235 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1329,8 +1329,12 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
if (ret || *len == 0)
goto out_unlock;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 3275e01ed33d..32e1c34349f2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
struct iomap *srcmap);
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same,
+ const struct iomap_ops *ops);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..e2c348553d87 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -71,6 +71,7 @@ struct fsverity_operations;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
+struct iomap_ops;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -2126,10 +2127,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags);
-extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *count,
- unsigned int remap_flags);
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *count, unsigned int remap_flags);
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
--
2.31.1
In fsdax mode, WRITE and ZERO on a shared extent need CoW to be
performed. After CoW, the newly allocated extents need to be remapped
to the file. So, add an iomap_end for the dax write ops to do the
remapping work.
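The resulting write path, sketched (the names are from this patch; the
flow summary is mine):

/*
 * dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops)
 *   .iomap_begin = xfs_direct_write_iomap_begin
 *       allocates a CoW destination extent for shared blocks
 *   dax_iomap_actor()
 *       copies the unchanged edges, then writes the user data
 *   .iomap_end = xfs_dax_write_iomap_end
 *       updates i_size, then xfs_reflink_end_cow() remaps the new
 *       extent into the file (or the CoW range is cancelled on error)
 */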
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_bmap_util.c | 3 +--
fs/xfs/xfs_file.c | 9 +++----
fs/xfs/xfs_iomap.c | 61 +++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 4 +++
fs/xfs/xfs_iops.c | 7 +++--
fs/xfs/xfs_reflink.c | 3 +--
6 files changed, 72 insertions(+), 15 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a5e9d7d34023..2a36dc93ff27 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -965,8 +965,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_zero_range(ip, offset, len, NULL);
if (error)
return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 396ef36dcd0a..38d8eca05aee 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -684,11 +684,8 @@ xfs_file_dax_write(
pos = iocb->ki_pos;
trace_xfs_file_dax_write(iocb, from);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
- if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
- i_size_write(inode, iocb->ki_pos);
- error = xfs_setfilesize(ip, pos, ret);
- }
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
+
out:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -1309,7 +1306,7 @@ __xfs_filemap_fault(
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
+ &xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d154f42e2dc6..8b593a51480d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -761,7 +761,8 @@ xfs_direct_write_iomap_begin(
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -854,6 +855,41 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
+static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ int error = 0;
+ struct xfs_inode *ip = XFS_I(inode);
+ bool cow = xfs_is_cow_inode(ip);
+
+ if (!written)
+ return 0;
+
+ if (pos + written > i_size_read(inode) && !(flags & IOMAP_FAULT)) {
+ i_size_write(inode, pos + written);
+ error = xfs_setfilesize(ip, pos, written);
+ if (error && cow) {
+ xfs_reflink_cancel_cow_range(ip, pos, written, true);
+ return error;
+ }
+ }
+ if (cow)
+ error = xfs_reflink_end_cow(ip, pos, written);
+
+ return error;
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1311,3 +1347,26 @@ xfs_xattr_iomap_begin(
const struct iomap_ops xfs_xattr_iomap_ops = {
.iomap_begin = xfs_xattr_iomap_begin,
};
+
+int
+xfs_iomap_zero_range(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t len,
+ bool *did_zero)
+{
+ return iomap_zero_range(VFS_I(ip), offset, len, did_zero,
+ IS_DAX(VFS_I(ip)) ? &xfs_dax_write_iomap_ops
+ : &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_iomap_truncate_page(
+ struct xfs_inode *ip,
+ loff_t pos,
+ bool *did_zero)
+{
+ return iomap_truncate_page(VFS_I(ip), pos, did_zero,
+ IS_DAX(VFS_I(ip)) ? &xfs_dax_write_iomap_ops
+ : &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..e4e515cd63b5 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -14,6 +14,9 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+int xfs_iomap_zero_range(struct xfs_inode *ip, loff_t offset, loff_t len,
+ bool *did_zero);
+int xfs_iomap_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
@@ -42,6 +45,7 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index dfe24b7f26e5..6d936c3e1a6e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -911,8 +911,8 @@ xfs_setattr_size(
*/
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
- error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_zero_range(ip, oldsize, newsize - oldsize,
+ &did_zeroing);
} else {
/*
* iomap won't detect a dirty page over an unwritten block (or a
@@ -924,8 +924,7 @@ xfs_setattr_size(
newsize);
if (error)
return error;
- error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_iomap_truncate_page(ip, newsize, &did_zeroing);
}
if (error)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d25434f93235..9a780948dbd0 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1266,8 +1266,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_iomap_zero_range(ip, isize, pos - isize, NULL);
}
/*
--
2.31.1
Introduce xfs_mmaplock_two_inodes_and_break_dax_layout() for dax files
that are going to be deduped. After that, call the range comparison
function only when both files are DAX, or neither is.
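For reference, the "layout is still busy" condition that the nested
lock path below relies on, as a minimal sketch (the wrapper is mine;
dax_layout_busy_page() and page_ref_count() are existing helpers):

static bool example_dax_layout_busy(struct address_space *mapping)
{
	struct page *page = dax_layout_busy_page(mapping);

	/* An elevated refcount (e.g. from get_user_pages()) means the
	 * pmem page is pinned, so its extent must not move yet. */
	return page && page_ref_count(page) != 1;
}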
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_file.c | 2 +-
fs/xfs/xfs_inode.c | 66 +++++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_reflink.c | 4 +--
4 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 38d8eca05aee..bd5002d38df4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -823,7 +823,7 @@ xfs_wait_dax_page(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
-static int
+int
xfs_break_dax_layouts(
struct inode *inode,
bool *retry)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0369eb22c1bb..0774b6e2b940 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3711,6 +3711,64 @@ xfs_iolock_two_inodes_and_break_layout(
return 0;
}
+static int
+xfs_mmaplock_two_inodes_and_break_dax_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error, attempts = 0;
+ bool retry;
+ struct xfs_inode *ip0, *ip1;
+ struct page *page;
+ struct xfs_log_item *lp;
+
+ if (src > dest)
+ swap(src, dest);
+ ip0 = XFS_I(src);
+ ip1 = XFS_I(dest);
+
+again:
+ retry = false;
+ /* Lock the first inode */
+ xfs_ilock(ip0, XFS_MMAPLOCK_EXCL);
+ error = xfs_break_dax_layouts(src, &retry);
+ if (error || retry) {
+ xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Nested lock the second inode */
+ lp = &ip0->i_itemp->ili_item;
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
+ if (!xfs_ilock_nowait(ip1,
+ xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1))) {
+ xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else
+ xfs_ilock(ip1, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
+ /*
+ * We cannot use xfs_break_dax_layouts() directly here because it may
+ * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
+ * for this nested lock case.
+ */
+ page = dax_layout_busy_page(dest->i_mapping);
+ if (page) {
+ if (page_ref_count(page) != 1) {
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+ }
+
+ return 0;
+}
+
/*
* Lock two inodes so that userspace cannot initiate I/O via file syscalls or
* mmap activity.
@@ -3721,10 +3779,16 @@ xfs_ilock2_io_mmap(
struct xfs_inode *ip2)
{
int ret;
+ struct inode *ino1 = VFS_I(ip1);
+ struct inode *ino2 = VFS_I(ip2);
- ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ ret = xfs_iolock_two_inodes_and_break_layout(ino1, ino2);
if (ret)
return ret;
+
+ if (IS_DAX(ino1) && IS_DAX(ino2))
+ return xfs_mmaplock_two_inodes_and_break_dax_layout(ino1, ino2);
+
if (ip1 == ip2)
xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
else
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca826cfba91c..2d0b344fb100 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -457,6 +457,7 @@ enum xfs_prealloc_flags {
int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
+int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 9a780948dbd0..ff308304c5cd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1324,8 +1324,8 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
if (!IS_DAX(inode_in))
--
2.31.1
Some operations, such as comparing a range of data in two files under
fsdax mode, require nested iomap_begin()/iomap_end() calls on two
files. Thus, we introduce iomap_apply2() to accept arguments for two
files, and iomap_actor2_t for actions on two files.
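The shape of such an actor, as a minimal sketch (the dax range
comparison later in this series is the real user; the example_* name is
mine):

static loff_t example_actor2(struct inode *ino1, loff_t pos1,
		struct inode *ino2, loff_t pos2, loff_t len, void *data,
		struct iomap *smap, struct iomap *dmap)
{
	bool *same = data;

	/* Act on the two mappings; return bytes handled or -errno. */
	*same = smap->type == dmap->type;
	return len;
}

A caller would then do:
	iomap_apply2(ino1, pos1, ino2, pos2, len, 0, ops, &same,
		     example_actor2);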
Signed-off-by: Shiyang Ruan <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/iomap/apply.c | 52 +++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 7 +++++-
2 files changed, 58 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 26ab6563181f..0493da5286ad 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -97,3 +97,55 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
}
+
+loff_t
+iomap_apply2(struct inode *ino1, loff_t pos1, struct inode *ino2, loff_t pos2,
+ loff_t length, unsigned int flags, const struct iomap_ops *ops,
+ void *data, iomap_actor2_t actor)
+{
+ struct iomap smap = { .type = IOMAP_HOLE };
+ struct iomap dmap = { .type = IOMAP_HOLE };
+ loff_t written = 0, ret, ret2 = 0;
+ loff_t len1 = length, len2, min_len;
+
+ ret = ops->iomap_begin(ino1, pos1, len1, flags, &smap, NULL);
+ if (ret)
+ goto out;
+ if (WARN_ON(smap.offset > pos1)) {
+ written = -EIO;
+ goto out_src;
+ }
+ if (WARN_ON(smap.length == 0)) {
+ written = -EIO;
+ goto out_src;
+ }
+ len2 = min_t(loff_t, len1, smap.length);
+
+ ret = ops->iomap_begin(ino2, pos2, len2, flags, &dmap, NULL);
+ if (ret)
+ goto out_src;
+ if (WARN_ON(dmap.offset > pos2)) {
+ written = -EIO;
+ goto out_dest;
+ }
+ if (WARN_ON(dmap.length == 0)) {
+ written = -EIO;
+ goto out_dest;
+ }
+ min_len = min_t(loff_t, len2, dmap.length);
+
+ written = actor(ino1, pos1, ino2, pos2, min_len, data, &smap, &dmap);
+
+out_dest:
+ if (ops->iomap_end)
+ ret2 = ops->iomap_end(ino2, pos2, len2,
+ written > 0 ? written : 0, flags, &dmap);
+out_src:
+ if (ops->iomap_end)
+ ret = ops->iomap_end(ino1, pos1, len1,
+ written > 0 ? written : 0, flags, &smap);
+out:
+ if (written)
+ return written;
+ return ret ?: ret2;
+}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c87d0cb0de6d..95562f863ad0 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -150,10 +150,15 @@ struct iomap_ops {
*/
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
void *data, struct iomap *iomap, struct iomap *srcmap);
-
+typedef loff_t (*iomap_actor2_t)(struct inode *ino1, loff_t pos1,
+ struct inode *ino2, loff_t pos2, loff_t len, void *data,
+ struct iomap *smap, struct iomap *dmap);
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, const struct iomap_ops *ops, void *data,
iomap_actor_t actor);
+loff_t iomap_apply2(struct inode *ino1, loff_t pos1, struct inode *ino2,
+ loff_t pos2, loff_t length, unsigned int flags,
+ const struct iomap_ops *ops, void *data, iomap_actor2_t actor);
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
--
2.31.1
On Tue, May 11, 2021 at 11:09:26AM +0800, Shiyang Ruan wrote:
> This patchset is an attempt to add CoW support for fsdax, taking XFS,
> which has both the reflink and fsdax features, as an example.
Slightly off topic, but I noticed all my pmem disappeared once I rolled
forward to 5.13-rc1. Am I the only lucky one? Qemu 4.2, with fake
memory devices backed by tmpfs files -- info qtree says they're there,
but the kernel doesn't show anything in /proc/iomem.
--D
> -----Original Message-----
> From: Darrick J. Wong <[email protected]>
> Sent: Tuesday, May 11, 2021 11:57 AM
> Subject: Re: [PATCH v5 0/7] fsdax,xfs: Add reflink&dedupe support for fsdax
>
> On Tue, May 11, 2021 at 11:09:26AM +0800, Shiyang Ruan wrote:
> > This patchset is an attempt to add CoW support for fsdax, taking XFS,
> > which has both the reflink and fsdax features, as an example.
>
> Slightly off topic, but I noticed all my pmem disappeared once I rolled forward to
> 5.13-rc1. Am I the only lucky one? Qemu 4.2, with fake memory devices
> backed by tmpfs files -- info qtree says they're there, but the kernel doesn't show
> anything in /proc/iomem.
I have the same situation on 5.13-rc1 too. (Qemu 5.2.0, fake memory device backed by files)
I tested this code in v5.12-rc8 and then rebased it to v5.13-rc1... It's my bad for not testing again.
--
Thanks,
Ruan Shiyang.
On Tue, May 11, 2021 at 11:09:33AM +0800, Shiyang Ruan wrote:
> Introduce xfs_mmaplock_two_inodes_and_break_dax_layout() for dax files
> that are going to be deduped. After that, call the range comparison
> function only when both files are DAX, or neither is.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/xfs/xfs_file.c | 2 +-
> fs/xfs/xfs_inode.c | 66 +++++++++++++++++++++++++++++++++++++++++++-
> fs/xfs/xfs_inode.h | 1 +
> fs/xfs/xfs_reflink.c | 4 +--
> 4 files changed, 69 insertions(+), 4 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 38d8eca05aee..bd5002d38df4 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -823,7 +823,7 @@ xfs_wait_dax_page(
> xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> }
>
> -static int
> +int
> xfs_break_dax_layouts(
> struct inode *inode,
> bool *retry)
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 0369eb22c1bb..0774b6e2b940 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -3711,6 +3711,64 @@ xfs_iolock_two_inodes_and_break_layout(
> return 0;
> }
>
> +static int
> +xfs_mmaplock_two_inodes_and_break_dax_layout(
> + struct inode *src,
> + struct inode *dest)
MMAPLOCK is an xfs_inode lock, so please pass those in here.
> +{
> + int error, attempts = 0;
> + bool retry;
> + struct xfs_inode *ip0, *ip1;
> + struct page *page;
> + struct xfs_log_item *lp;
> +
> + if (src > dest)
> + swap(src, dest);
The MMAPLOCK (and ILOCK) locking order is increasing inode number, not
the address of the incore object. This is different from (and not
consistent with) i_rwsem/XFS_IOLOCK, but those are the rules.
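A sketch of what that ordering would look like:

	/* MMAPLOCK/ILOCK rule: lock the lower inode number first. */
	if (ip0->i_ino > ip1->i_ino)
		swap(ip0, ip1);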
> + ip0 = XFS_I(src);
> + ip1 = XFS_I(dest);
> +
> +again:
> + retry = false;
> + /* Lock the first inode */
> + xfs_ilock(ip0, XFS_MMAPLOCK_EXCL);
> + error = xfs_break_dax_layouts(src, &retry);
> + if (error || retry) {
> + xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
> + goto again;
> + }
> +
> + if (src == dest)
> + return 0;
> +
> + /* Nested lock the second inode */
> + lp = &ip0->i_itemp->ili_item;
> + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
> + if (!xfs_ilock_nowait(ip1,
> + xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1))) {
> + xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
> + if ((++attempts % 5) == 0)
> + delay(1); /* Don't just spin the CPU */
> + goto again;
> + }
> + } else
> + xfs_ilock(ip1, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
> + /*
> + * We cannot use xfs_break_dax_layouts() directly here because it may
> + * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
> + * for this nested lock case.
> + */
> + page = dax_layout_busy_page(dest->i_mapping);
> + if (page) {
> + if (page_ref_count(page) != 1) {
This could be flattened to:
if (page && page_ref_count(page) != 1) {
...
}
--D
On Tue, May 11, 2021 at 11:09:27AM +0800, Shiyang Ruan wrote:
> In the case where the iomap is a write operation and the iomap is not
> equal to the srcmap after iomap_begin(), we consider it a CoW
> operation.
>
> The destination extent which the iomap indicates is a newly allocated
> extent, so we need to copy the data from the srcmap to it.
> In theory, it is better to copy only the head and tail ranges that fall
> outside of the written area instead of copying the whole aligned range.
> But a dax page fault always covers an aligned range, so we have to copy
> the whole range in that case.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 81 insertions(+), 5 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index bf3fc8242e6c..f0249bb1d46a 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1038,6 +1038,61 @@ static int dax_iomap_direct_access(struct iomap *iomap, loff_t pos, size_t size,
> return rc;
> }
>
> +/**
> + * dax_iomap_cow_copy(): Copy the data from source to destination before write.
> + * @pos: address to do copy from.
> + * @length: size of copy operation.
> + * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
> + * @srcmap: iomap srcmap
> + * @daddr: destination address to copy to.
> + *
> + * This can be called from two places. Either during DAX write fault, to copy
> + * the length size data to daddr. Or, while doing normal DAX write operation,
> + * dax_iomap_actor() might call this to do the copy of either start or end
> + * unaligned address. In this case the rest of the copy of aligned ranges is
> + * taken care by dax_iomap_actor() itself.
> + * Also, note DAX fault will always result in aligned pos and pos + length.
> + */
> +static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t align_size,
Nit: Linus has asked us not to continue the use of loff_t for file
io length. Could you change this to 'uint64_t length', please?
(Assuming we even need the extra length bits?)
With that fixed up...
Reviewed-by: Darrick J. Wong <[email protected]>
--D
> + struct iomap *srcmap, void *daddr)
> +{
> + loff_t head_off = pos & (align_size - 1);
Other nit: head_off = round_down(pos, align_size); ?
On Tue, May 11, 2021 at 11:09:28AM +0800, Shiyang Ruan wrote:
> We replace the existing entry with the newly allocated one in case of
> CoW. Also, we mark the entry as PAGECACHE_TAG_TOWRITE so that
> writeback marks this entry as write-protected. This helps with
> snapshots: new write page faults after a snapshot trigger a CoW.
>
> Signed-off-by: Goldwyn Rodrigues <[email protected]>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Ritesh Harjani <[email protected]>
Seems fine to me...
Reviewed-by: Darrick J. Wong <[email protected]>
--D
On Tue, May 11, 2021 at 11:09:29AM +0800, Shiyang Ruan wrote:
> Punching a hole in a reflinked file needs dax_iomap_cow_copy() too;
> otherwise, data in the non-aligned area will not be correct. So, add
> the srcmap to dax_iomap_zero() and replace the memset() with
> dax_iomap_cow_copy().
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Ritesh Harjani <[email protected]>
> ---
> fs/dax.c | 25 +++++++++++++++----------
> fs/iomap/buffered-io.c | 2 +-
> include/linux/dax.h | 3 ++-
> 3 files changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index ef0e564e7904..ee9d28a79bfb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> }
> #endif /* CONFIG_FS_DAX_PMD */
>
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> + struct iomap *srcmap)
> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> pgoff_t pgoff;
> @@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
>
> if (page_aligned)
> rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> - else
> + else {
> rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> - if (rc < 0) {
> - dax_read_unlock(id);
> - return rc;
> - }
> -
> - if (!page_aligned) {
> - memset(kaddr + offset, 0, size);
> + if (rc < 0)
> + goto out;
> + if (iomap->addr != srcmap->addr) {
Why isn't this "if (srcmap->type != IOMAP_HOLE)" ?
I suppose it has the same effect, since @iomap should never be a hole
and we should never have a @srcmap that's the same as @iomap, but
still, we use IOMAP_HOLE checks in most other parts of fs/iomap/.
Other than that, the logic looks decent to me.
--D
> + rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> + kaddr);
> + if (rc < 0)
> + goto out;
> + } else
> + memset(kaddr + offset, 0, size);
> dax_flush(iomap->dax_dev, kaddr + offset, size);
> }
> +
> +out:
> dax_read_unlock(id);
> - return size;
> + return rc < 0 ? rc : size;
> }
>
> static loff_t
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index f2cd2034a87b..2734955ea67f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
> s64 bytes;
>
> if (IS_DAX(inode))
> - bytes = dax_iomap_zero(pos, length, iomap);
> + bytes = dax_iomap_zero(pos, length, iomap, srcmap);
> else
> bytes = iomap_zero(inode, pos, length, iomap, srcmap);
> if (bytes < 0)
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index b52f084aa643..3275e01ed33d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> + struct iomap *srcmap);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> --
> 2.31.1
>
>
>
On Tue, May 11, 2021 at 11:09:31AM +0800, Shiyang Ruan wrote:
> With dax we cannot deal with readpage() etc. So, we create a dax
> comparison function which is similar to
> vfs_dedupe_file_range_compare().
> And introduce dax_remap_file_range_prep() for filesystem use.
>
> Signed-off-by: Goldwyn Rodrigues <[email protected]>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/dax.c | 56 +++++++++++++++++++++++++++++++++++++++++++
> fs/remap_range.c | 57 +++++++++++++++++++++++++++++++++++++-------
> fs/xfs/xfs_reflink.c | 8 +++++--
> include/linux/dax.h | 4 ++++
> include/linux/fs.h | 12 ++++++----
> 5 files changed, 123 insertions(+), 14 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index ee9d28a79bfb..dedf1be0155c 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1853,3 +1853,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> return dax_insert_pfn_mkwrite(vmf, pfn, order);
> }
> EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
> +
> +static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
> + struct inode *ino2, loff_t pos2, loff_t len, void *data,
> + struct iomap *smap, struct iomap *dmap)
> +{
> + void *saddr, *daddr;
> + bool *same = data;
> + int ret;
> +
> + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
> + *same = true;
> + return len;
> + }
> +
> + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
> + *same = false;
> + return 0;
> + }
> +
> + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
> + &saddr, NULL);
> + if (ret < 0)
> + return -EIO;
> +
> + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
> + &daddr, NULL);
> + if (ret < 0)
> + return -EIO;
> +
> + *same = !memcmp(saddr, daddr, len);
> + return len;
> +}
> +
> +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
> + const struct iomap_ops *ops)
> +{
> + int id, ret = 0;
> +
> + id = dax_read_lock();
> + while (len) {
> + ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
> + is_same, dax_range_compare_actor);
> + if (ret < 0 || !*is_same)
> + goto out;
> +
> + len -= ret;
> + srcoff += ret;
> + destoff += ret;
> + }
> + ret = 0;
> +out:
> + dax_read_unlock(id);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
> diff --git a/fs/remap_range.c b/fs/remap_range.c
> index e4a5fdd7ad7b..7bc4c8e3aa9f 100644
> --- a/fs/remap_range.c
> +++ b/fs/remap_range.c
> @@ -14,6 +14,7 @@
> #include <linux/compat.h>
> #include <linux/mount.h>
> #include <linux/fs.h>
> +#include <linux/dax.h>
> #include "internal.h"
>
> #include <linux/uaccess.h>
> @@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
> * Compare extents of two files to see if they are the same.
> * Caller must have locked both inodes to prevent write races.
> */
> -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> - struct inode *dest, loff_t destoff,
> - loff_t len, bool *is_same)
> +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff,
> + loff_t len, bool *is_same)
> {
> loff_t src_poff;
> loff_t dest_poff;
> @@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> out_error:
> return error;
> }
> +EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
>
> /*
> * Check that the two inodes are eligible for cloning, the ranges make
> @@ -289,9 +291,11 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> * If there's an error, then the usual negative error code is returned.
> * Otherwise returns 0 with *len set to the request length.
> */
> -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> - struct file *file_out, loff_t pos_out,
> - loff_t *len, unsigned int remap_flags)
> +static int
> +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *dax_read_ops)
> {
> struct inode *inode_in = file_inode(file_in);
> struct inode *inode_out = file_inode(file_out);
> @@ -351,8 +355,17 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> if (remap_flags & REMAP_FILE_DEDUP) {
> bool is_same = false;
>
> - ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> - inode_out, pos_out, *len, &is_same);
> + if (!IS_DAX(inode_in))
> + ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> + inode_out, pos_out, *len, &is_same);
> +#ifdef CONFIG_FS_DAX
> + else if (dax_read_ops)
> + ret = dax_dedupe_file_range_compare(inode_in, pos_in,
> + inode_out, pos_out, *len, &is_same,
> + dax_read_ops);
> +#endif /* CONFIG_FS_DAX */
Hmm, can you add an entry to the !IS_ENABLED(CONFIG_DAX) part of dax.h
that defines dax_dedupe_file_range_compare as a dummy function that
returns -EOPNOTSUPP? We try not to sprinkle preprocessor directives
into the middle of functions, per Linus rules.
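Something like this minimal sketch, assuming it lands in the
!CONFIG_FS_DAX branch of include/linux/dax.h:

	static inline int dax_dedupe_file_range_compare(struct inode *src,
			loff_t srcoff, struct inode *dest, loff_t destoff,
			loff_t len, bool *is_same, const struct iomap_ops *ops)
	{
		return -EOPNOTSUPP;
	}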
> + else
> + return -EINVAL;
> if (ret)
> return ret;
> if (!is_same)
> @@ -370,6 +383,34 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
>
> return ret;
> }
> +
> +#ifdef CONFIG_FS_DAX
> +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops)
> +{
> + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags, ops);
> +}
> +#else
> +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops)
> +{
> + return -EOPNOTSUPP;
> +}
> +#endif /* CONFIG_FS_DAX */
> +EXPORT_SYMBOL(dax_remap_file_range_prep);
I think this symbol belongs in fs/dax.c and the declaration in dax.h?
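That is, roughly this declaration moved into include/linux/dax.h (a
sketch of the suggested placement, not the posted patch):

	int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			struct file *file_out, loff_t pos_out,
			loff_t *len, unsigned int remap_flags,
			const struct iomap_ops *ops);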
--D
> +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags)
> +{
> + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags, NULL);
> +}
> EXPORT_SYMBOL(generic_remap_file_range_prep);
>
> loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 060695d6d56a..d25434f93235 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1329,8 +1329,12 @@ xfs_reflink_remap_prep(
> if (IS_DAX(inode_in) || IS_DAX(inode_out))
> goto out_unlock;
>
> - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
> - len, remap_flags);
> + if (!IS_DAX(inode_in))
> + ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags);
> + else
> + ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
> + pos_out, len, remap_flags, &xfs_read_iomap_ops);
> if (ret || *len == 0)
> goto out_unlock;
>
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 3275e01ed33d..32e1c34349f2 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> struct iomap *srcmap);
> +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> + struct inode *dest, loff_t destoff,
> + loff_t len, bool *is_same,
> + const struct iomap_ops *ops);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c3c88fdb9b2a..e2c348553d87 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -71,6 +71,7 @@ struct fsverity_operations;
> struct fs_context;
> struct fs_parameter_spec;
> struct fileattr;
> +struct iomap_ops;
>
> extern void __init inode_init(void);
> extern void __init inode_init_early(void);
> @@ -2126,10 +2127,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> struct file *file_out, loff_t pos_out,
> size_t len, unsigned int flags);
> -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> - struct file *file_out, loff_t pos_out,
> - loff_t *count,
> - unsigned int remap_flags);
> +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *count, unsigned int remap_flags);
> +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + loff_t *len, unsigned int remap_flags,
> + const struct iomap_ops *ops);
> extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> struct file *file_out, loff_t pos_out,
> loff_t len, unsigned int remap_flags);
> --
> 2.31.1
>
>
>
> -----Original Message-----
> From: Darrick J. Wong <[email protected]>
> Subject: Re: [PATCH v5 7/7] fs/xfs: Add dax dedupe support
>
> On Tue, May 11, 2021 at 11:09:33AM +0800, Shiyang Ruan wrote:
> > Introduce xfs_mmaplock_two_inodes_and_break_dax_layout() for dax files
> > who are going to be deduped. After that, call compare range function
> > only when files are both DAX or not.
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/xfs/xfs_file.c | 2 +-
> > fs/xfs/xfs_inode.c | 66
> +++++++++++++++++++++++++++++++++++++++++++-
> > fs/xfs/xfs_inode.h | 1 +
> > fs/xfs/xfs_reflink.c | 4 +--
> > 4 files changed, 69 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index
> > 38d8eca05aee..bd5002d38df4 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -823,7 +823,7 @@ xfs_wait_dax_page(
> > xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
> > }
> >
> > -static int
> > +int
> > xfs_break_dax_layouts(
> > struct inode *inode,
> > bool *retry)
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index
> > 0369eb22c1bb..0774b6e2b940 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -3711,6 +3711,64 @@ xfs_iolock_two_inodes_and_break_layout(
> > return 0;
> > }
> >
> > +static int
> > +xfs_mmaplock_two_inodes_and_break_dax_layout(
> > + struct inode *src,
> > + struct inode *dest)
>
> MMAPLOCK is an xfs_inode lock, so please pass those in here.
>
> > +{
> > + int error, attempts = 0;
> > + bool retry;
> > + struct xfs_inode *ip0, *ip1;
> > + struct page *page;
> > + struct xfs_log_item *lp;
> > +
> > + if (src > dest)
> > + swap(src, dest);
>
> The MMAPLOCK (and ILOCK) locking order is increasing inode number, not the
> address of the incore object. This is different (and not consistent
> with) i_rwsem/XFS_IOLOCK, but those are the rules.
Yes, I misunderstood here.
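So the ordering would be by inode number, e.g. a sketch like:

	ip0 = XFS_I(src);
	ip1 = XFS_I(dest);
	/* lock order: increasing inode number */
	if (ip0->i_ino > ip1->i_ino)
		swap(ip0, ip1);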
>
> > + ip0 = XFS_I(src);
> > + ip1 = XFS_I(dest);
> > +
> > +again:
> > + retry = false;
> > + /* Lock the first inode */
> > + xfs_ilock(ip0, XFS_MMAPLOCK_EXCL);
> > + error = xfs_break_dax_layouts(src, &retry);
> > + if (error || retry) {
> > + xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
> > + goto again;
> > + }
> > +
> > + if (src == dest)
> > + return 0;
> > +
> > + /* Nested lock the second inode */
> > + lp = &ip0->i_itemp->ili_item;
> > + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
> > + if (!xfs_ilock_nowait(ip1,
> > + xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1))) {
> > + xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
> > + if ((++attempts % 5) == 0)
> > + delay(1); /* Don't just spin the CPU */
> > + goto again;
> > + }
> > + } else
> > + xfs_ilock(ip1, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
> > + /*
> > + * We cannot use xfs_break_dax_layouts() directly here because it may
> > + * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
> > + * for this nested lock case.
> > + */
> > + page = dax_layout_busy_page(dest->i_mapping);
> > + if (page) {
> > + if (page_ref_count(page) != 1) {
>
> This could be flattened to:
>
> if (page && page_ref_count(page) != 1) {
> ...
> }
OK.
--
Thanks,
Ruan Shiyang.
>
> --D
>
> > + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
> > + xfs_iunlock(ip0, XFS_MMAPLOCK_EXCL);
> > + goto again;
> > + }
> > + }
> > +
> > + return 0;
> > +}
> > +
> > /*
> > * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
> > * mmap activity.
> > @@ -3721,10 +3779,16 @@ xfs_ilock2_io_mmap(
> > struct xfs_inode *ip2)
> > {
> > int ret;
> > + struct inode *ino1 = VFS_I(ip1);
> > + struct inode *ino2 = VFS_I(ip2);
> >
> > - ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
> > + ret = xfs_iolock_two_inodes_and_break_layout(ino1, ino2);
> > if (ret)
> > return ret;
> > +
> > + if (IS_DAX(ino1) && IS_DAX(ino2))
> > + return xfs_mmaplock_two_inodes_and_break_dax_layout(ino1, ino2);
> > +
> > if (ip1 == ip2)
> > xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
> > else
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index
> > ca826cfba91c..2d0b344fb100 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -457,6 +457,7 @@ enum xfs_prealloc_flags {
> >
> > int xfs_update_prealloc_flags(struct xfs_inode *ip,
> > enum xfs_prealloc_flags flags);
> > +int xfs_break_dax_layouts(struct inode *inode, bool *retry);
> > int xfs_break_layouts(struct inode *inode, uint *iolock,
> > enum layout_break_reason reason);
> >
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index
> > 9a780948dbd0..ff308304c5cd 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -1324,8 +1324,8 @@ xfs_reflink_remap_prep(
> > if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
> > goto out_unlock;
> >
> > - /* Don't share DAX file data for now. */
> > - if (IS_DAX(inode_in) || IS_DAX(inode_out))
> > + /* Don't share DAX file data with non-DAX file. */
> > + if (IS_DAX(inode_in) != IS_DAX(inode_out))
> > goto out_unlock;
> >
> > if (!IS_DAX(inode_in))
> > --
> > 2.31.1
> >
> >
> >
On Tue, May 11, 2021 at 11:09:32AM +0800, Shiyang Ruan wrote:
> In fsdax mode, WRITE and ZERO on a shared extent need CoW to be performed.
> After CoW, newly allocated extents need to be remapped to the file. So, add
> an iomap_end for dax write ops to do the remapping work.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
> fs/xfs/xfs_bmap_util.c | 3 +--
> fs/xfs/xfs_file.c | 9 +++----
> fs/xfs/xfs_iomap.c | 61 +++++++++++++++++++++++++++++++++++++++++-
> fs/xfs/xfs_iomap.h | 4 +++
> fs/xfs/xfs_iops.c | 7 +++--
> fs/xfs/xfs_reflink.c | 3 +--
> 6 files changed, 72 insertions(+), 15 deletions(-)
>
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index a5e9d7d34023..2a36dc93ff27 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -965,8 +965,7 @@ xfs_free_file_space(
> return 0;
> if (offset + len > XFS_ISIZE(ip))
> len = XFS_ISIZE(ip) - offset;
> - error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
> - &xfs_buffered_write_iomap_ops);
> + error = xfs_iomap_zero_range(ip, offset, len, NULL);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 396ef36dcd0a..38d8eca05aee 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -684,11 +684,8 @@ xfs_file_dax_write(
> pos = iocb->ki_pos;
>
> trace_xfs_file_dax_write(iocb, from);
> - ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
> - if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
> - i_size_write(inode, iocb->ki_pos);
> - error = xfs_setfilesize(ip, pos, ret);
> - }
> + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
> +
> out:
> if (iolock)
> xfs_iunlock(ip, iolock);
> @@ -1309,7 +1306,7 @@ __xfs_filemap_fault(
>
> ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
> (write_fault && !vmf->cow_page) ?
> - &xfs_direct_write_iomap_ops :
> + &xfs_dax_write_iomap_ops :
> &xfs_read_iomap_ops);
> if (ret & VM_FAULT_NEEDDSYNC)
> ret = dax_finish_sync_fault(vmf, pe_size, pfn);
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index d154f42e2dc6..8b593a51480d 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -761,7 +761,8 @@ xfs_direct_write_iomap_begin(
>
> /* may drop and re-acquire the ilock */
> error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> - &lockmode, flags & IOMAP_DIRECT);
> + &lockmode,
> + (flags & IOMAP_DIRECT) || IS_DAX(inode));
> if (error)
> goto out_unlock;
> if (shared)
> @@ -854,6 +855,41 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
> .iomap_begin = xfs_direct_write_iomap_begin,
> };
>
> +static int
> +xfs_dax_write_iomap_end(
> + struct inode *inode,
> + loff_t pos,
> + loff_t length,
> + ssize_t written,
> + unsigned int flags,
> + struct iomap *iomap)
> +{
> + int error = 0;
> + struct xfs_inode *ip = XFS_I(inode);
> + bool cow = xfs_is_cow_inode(ip);
> +
> + if (!written)
> + return 0;
> +
> + if (pos + written > i_size_read(inode) && !(flags & IOMAP_FAULT)) {
> + i_size_write(inode, pos + written);
> + error = xfs_setfilesize(ip, pos, written);
> + if (error && cow) {
> + xfs_reflink_cancel_cow_range(ip, pos, written, true);
> + return error;
> + }
> + }
> + if (cow)
> + error = xfs_reflink_end_cow(ip, pos, written);
> +
> + return error;
> +}
> +
> +const struct iomap_ops xfs_dax_write_iomap_ops = {
> + .iomap_begin = xfs_direct_write_iomap_begin,
> + .iomap_end = xfs_dax_write_iomap_end,
> +};
> +
> static int
> xfs_buffered_write_iomap_begin(
> struct inode *inode,
> @@ -1311,3 +1347,26 @@ xfs_xattr_iomap_begin(
> const struct iomap_ops xfs_xattr_iomap_ops = {
> .iomap_begin = xfs_xattr_iomap_begin,
> };
> +
> +int
> +xfs_iomap_zero_range(
> + struct xfs_inode *ip,
> + loff_t offset,
> + loff_t len,
> + bool *did_zero)
> +{
> + return iomap_zero_range(VFS_I(ip), offset, len, did_zero,
> + IS_DAX(VFS_I(ip)) ? &xfs_dax_write_iomap_ops
> + : &xfs_buffered_write_iomap_ops);
> +}
> +
> +int
> +xfs_iomap_truncate_page(
> + struct xfs_inode *ip,
> + loff_t pos,
> + bool *did_zero)
> +{
> + return iomap_truncate_page(VFS_I(ip), pos, did_zero,
> + IS_DAX(VFS_I(ip)) ? &xfs_dax_write_iomap_ops
> + : &xfs_buffered_write_iomap_ops);
> +}
I wonder, can these become static inline helpers in xfs_iomap.h?
It would be kinda nice not to add another stack frame just to virtualize
the iomap ops.
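For instance, a sketch of the inline form in xfs_iomap.h, assuming the
iomap_ops structures are already declared above it in the header:

	static inline int
	xfs_iomap_zero_range(struct xfs_inode *ip, loff_t offset, loff_t len,
			bool *did_zero)
	{
		return iomap_zero_range(VFS_I(ip), offset, len, did_zero,
				IS_DAX(VFS_I(ip)) ? &xfs_dax_write_iomap_ops
						  : &xfs_buffered_write_iomap_ops);
	}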
--D
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 7d3703556d0e..e4e515cd63b5 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -14,6 +14,9 @@ struct xfs_bmbt_irec;
> int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
> xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
> int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
> +int xfs_iomap_zero_range(struct xfs_inode *ip, loff_t offset, loff_t len,
> + bool *did_zero);
> +int xfs_iomap_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
> xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
> xfs_fileoff_t end_fsb);
>
> @@ -42,6 +45,7 @@ xfs_aligned_fsb_count(
>
> extern const struct iomap_ops xfs_buffered_write_iomap_ops;
> extern const struct iomap_ops xfs_direct_write_iomap_ops;
> +extern const struct iomap_ops xfs_dax_write_iomap_ops;
> extern const struct iomap_ops xfs_read_iomap_ops;
> extern const struct iomap_ops xfs_seek_iomap_ops;
> extern const struct iomap_ops xfs_xattr_iomap_ops;
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index dfe24b7f26e5..6d936c3e1a6e 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -911,8 +911,8 @@ xfs_setattr_size(
> */
> if (newsize > oldsize) {
> trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
> - error = iomap_zero_range(inode, oldsize, newsize - oldsize,
> - &did_zeroing, &xfs_buffered_write_iomap_ops);
> + error = xfs_iomap_zero_range(ip, oldsize, newsize - oldsize,
> + &did_zeroing);
> } else {
> /*
> * iomap won't detect a dirty page over an unwritten block (or a
> @@ -924,8 +924,7 @@ xfs_setattr_size(
> newsize);
> if (error)
> return error;
> - error = iomap_truncate_page(inode, newsize, &did_zeroing,
> - &xfs_buffered_write_iomap_ops);
> + error = xfs_iomap_truncate_page(ip, newsize, &did_zeroing);
> }
>
> if (error)
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index d25434f93235..9a780948dbd0 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1266,8 +1266,7 @@ xfs_reflink_zero_posteof(
> return 0;
>
> trace_xfs_zero_eof(ip, isize, pos - isize);
> - return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
> - &xfs_buffered_write_iomap_ops);
> + return xfs_iomap_zero_range(ip, isize, pos - isize, NULL);
> }
>
> /*
> --
> 2.31.1
>
>
>
> -----Original Message-----
> From: Darrick J. Wong <[email protected]>
> Subject: Re: [PATCH v5 3/7] fsdax: Add dax_iomap_cow_copy() for
> dax_iomap_zero
>
> On Tue, May 11, 2021 at 11:09:29AM +0800, Shiyang Ruan wrote:
> > Punching a hole on a reflinked file needs dax_copy_edge() too. Otherwise,
> > data in the non-aligned area will not be correct. So, add the srcmap to
> > dax_iomap_zero() and replace memset() with dax_copy_edge().
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > Reviewed-by: Ritesh Harjani <[email protected]>
> > ---
> > fs/dax.c | 25 +++++++++++++++----------
> > fs/iomap/buffered-io.c | 2 +-
> > include/linux/dax.h | 3 ++-
> > 3 files changed, 18 insertions(+), 12 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index ef0e564e7904..ee9d28a79bfb 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct
> > xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD
> > */
> >
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > + struct iomap *srcmap)
> > {
> > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> > pgoff_t pgoff;
> > @@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length,
> > struct iomap *iomap)
> >
> > if (page_aligned)
> > rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> > - else
> > + else {
> > rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> > - if (rc < 0) {
> > - dax_read_unlock(id);
> > - return rc;
> > - }
> > -
> > - if (!page_aligned) {
> > - memset(kaddr + offset, 0, size);
> > + if (rc < 0)
> > + goto out;
> > + if (iomap->addr != srcmap->addr) {
>
> Why isn't this "if (srcmap->type != IOMAP_HOLE)" ?
>
> I suppose it has the same effect, since @iomap should never be a hole and we
> should never have a @srcmap that's the same as @iomap, but still, we use
> IOMAP_HOLE checks in most other parts of fs/iomap/.
According to its caller, `iomap_zero_range_actor()`, srcmap->type has already
been checked for IOMAP_HOLE before `dax_iomap_zero()` is reached. So the check
you suggested would always be true...
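The guard in the caller looks roughly like this (paraphrased from the
v5.13 fs/iomap/buffered-io.c, so treat it as a sketch):

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;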
--
Thanks,
Ruan Shiyang.
>
> Other than that, the logic looks decent to me.
>
> --D
>
> > + rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> > + kaddr);
> > + if (rc < 0)
> > + goto out;
> > + } else
> > + memset(kaddr + offset, 0, size);
> > dax_flush(iomap->dax_dev, kaddr + offset, size);
> > }
> > +
> > +out:
> > dax_read_unlock(id);
> > - return size;
> > + return rc < 0 ? rc : size;
> > }
> >
> > static loff_t
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index
> > f2cd2034a87b..2734955ea67f 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode
> *inode, loff_t pos,
> > s64 bytes;
> >
> > if (IS_DAX(inode))
> > - bytes = dax_iomap_zero(pos, length, iomap);
> > + bytes = dax_iomap_zero(pos, length, iomap, srcmap);
> > else
> > bytes = iomap_zero(inode, pos, length, iomap, srcmap);
> > if (bytes < 0)
> > diff --git a/include/linux/dax.h b/include/linux/dax.h index
> > b52f084aa643..3275e01ed33d 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault
> > *vmf, int dax_delete_mapping_entry(struct address_space *mapping,
> > pgoff_t index); int dax_invalidate_mapping_entry_sync(struct
> address_space *mapping,
> > pgoff_t index);
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > + struct iomap *srcmap);
> > static inline bool dax_mapping(struct address_space *mapping) {
> > return mapping->host && IS_DAX(mapping->host);
> > --
> > 2.31.1
> >
> >
> >
>
Hi,
On 11.5.2021 6.09, Shiyang Ruan wrote:
> Punching a hole on a reflinked file needs dax_copy_edge() too. Otherwise,
> data in the non-aligned area will not be correct. So, add the srcmap to
> dax_iomap_zero() and replace memset() with dax_copy_edge().
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> Reviewed-by: Ritesh Harjani <[email protected]>
> ---
> fs/dax.c | 25 +++++++++++++++----------
> fs/iomap/buffered-io.c | 2 +-
> include/linux/dax.h | 3 ++-
> 3 files changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index ef0e564e7904..ee9d28a79bfb 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> }
> #endif /* CONFIG_FS_DAX_PMD */
>
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> + struct iomap *srcmap)
> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> pgoff_t pgoff;
> @@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
>
> if (page_aligned)
> rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> - else
> + else {
> rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> - if (rc < 0) {
> - dax_read_unlock(id);
> - return rc;
> - }
> -
> - if (!page_aligned) {
> - memset(kaddr + offset, 0, size);
> + if (rc < 0)
> + goto out;
> + if (iomap->addr != srcmap->addr) {
> + rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> + kaddr);
The offset above is the offset within the page; I think dax_iomap_cow_copy()
expects the absolute pos.
> + if (rc < 0)
> + goto out;
> + } else
> + memset(kaddr + offset, 0, size);
> dax_flush(iomap->dax_dev, kaddr + offset, size);
> }
> +
> +out:
> dax_read_unlock(id);
> - return size;
> + return rc < 0 ? rc : size;
> }
>
> static loff_t
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index f2cd2034a87b..2734955ea67f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
> s64 bytes;
>
> if (IS_DAX(inode))
> - bytes = dax_iomap_zero(pos, length, iomap);
> + bytes = dax_iomap_zero(pos, length, iomap, srcmap);
> else
> bytes = iomap_zero(inode, pos, length, iomap, srcmap);
> if (bytes < 0)
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index b52f084aa643..3275e01ed33d 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> + struct iomap *srcmap);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> -----Original Message-----
> From: Mika Penttilä <[email protected]>
> Subject: Re: [PATCH v5 3/7] fsdax: Add dax_iomap_cow_copy() for
> dax_iomap_zero
>
> Hi,
>
> On 11.5.2021 6.09, Shiyang Ruan wrote:
> > Punching a hole on a reflinked file needs dax_copy_edge() too. Otherwise,
> > data in the non-aligned area will not be correct. So, add the srcmap to
> > dax_iomap_zero() and replace memset() with dax_copy_edge().
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > Reviewed-by: Ritesh Harjani <[email protected]>
> > ---
> > fs/dax.c | 25 +++++++++++++++----------
> > fs/iomap/buffered-io.c | 2 +-
> > include/linux/dax.h | 3 ++-
> > 3 files changed, 18 insertions(+), 12 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index ef0e564e7904..ee9d28a79bfb 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct
> xa_state *xas, struct vm_fault *vmf,
> > }
> > #endif /* CONFIG_FS_DAX_PMD */
> >
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > + struct iomap *srcmap)
> > {
> > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> > pgoff_t pgoff;
> > @@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length,
> > struct iomap *iomap)
> >
> > if (page_aligned)
> > rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> > - else
> > + else {
> > rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> > - if (rc < 0) {
> > - dax_read_unlock(id);
> > - return rc;
> > - }
> > -
> > - if (!page_aligned) {
> > - memset(kaddr + offset, 0, size);
> > + if (rc < 0)
> > + goto out;
> > + if (iomap->addr != srcmap->addr) {
> > + rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> > + kaddr);
>
> The offset above is the offset within the page; I think dax_iomap_cow_copy() expects the absolute pos.
You are right. We should pass pos here. Thanks for pointing that out.
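That is, the call would become something like:

	rc = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, kaddr);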
--
Thanks,
Ruan Shiyang.
>
> > + if (rc < 0)
> > + goto out;
> > + } else
> > + memset(kaddr + offset, 0, size);
> > dax_flush(iomap->dax_dev, kaddr + offset, size);
> > }
> > +
> > +out:
> > dax_read_unlock(id);
> > - return size;
> > + return rc < 0 ? rc : size;
> > }
> >
> > static loff_t
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index
> > f2cd2034a87b..2734955ea67f 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode
> *inode, loff_t pos,
> > s64 bytes;
> >
> > if (IS_DAX(inode))
> > - bytes = dax_iomap_zero(pos, length, iomap);
> > + bytes = dax_iomap_zero(pos, length, iomap, srcmap);
> > else
> > bytes = iomap_zero(inode, pos, length, iomap, srcmap);
> > if (bytes < 0)
> > diff --git a/include/linux/dax.h b/include/linux/dax.h index
> > b52f084aa643..3275e01ed33d 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault
> *vmf,
> > int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t
> index);
> > int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> > pgoff_t index);
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > + struct iomap *srcmap);
> > static inline bool dax_mapping(struct address_space *mapping)
> > {
> > return mapping->host && IS_DAX(mapping->host);
> -----Original Message-----
> From: Darrick J. Wong <[email protected]>
> Subject: Re: [PATCH v5 1/7] fsdax: Introduce dax_iomap_cow_copy()
>
> On Tue, May 11, 2021 at 11:09:27AM +0800, Shiyang Ruan wrote:
> > In the case where the iomap is a write operation and iomap is not
> > equal to srcmap after iomap_begin, we consider it a CoW operation.
> >
> > The destination extent which iomap indicated is a newly allocated extent.
> > So, the data needs to be copied from srcmap to the newly allocated extent.
> > In theory, it is better to copy only the head and tail ranges that fall
> > outside of the unaligned write range instead of copying the whole aligned
> > range. But in a dax page fault, it will always be an aligned range. So,
> > we have to copy the whole range in this case.
> >
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > Reviewed-by: Christoph Hellwig <[email protected]>
> > ---
> > fs/dax.c | 86
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 81 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index bf3fc8242e6c..f0249bb1d46a 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1038,6 +1038,61 @@ static int dax_iomap_direct_access(struct iomap
> *iomap, loff_t pos, size_t size,
> > return rc;
> > }
> >
> > +/**
> > + * dax_iomap_cow_copy(): Copy the data from source to destination before
> write.
> > + * @pos: address to do copy from.
> > + * @length: size of copy operation.
> > + * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
> > + * @srcmap: iomap srcmap
> > + * @daddr: destination address to copy to.
> > + *
> > + * This can be called from two places. Either during DAX write fault,
> > +to copy
> > + * the length size data to daddr. Or, while doing normal DAX write
> > +operation,
> > + * dax_iomap_actor() might call this to do the copy of either start
> > +or end
> > + * unaligned address. In this case the rest of the copy of aligned
> > +ranges is
> > + * taken care by dax_iomap_actor() itself.
> > + * Also, note DAX fault will always result in aligned pos and pos + length.
> > + */
> > +static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t
> > +align_size,
>
> Nit: Linus has asked us not to continue the use of loff_t for file io length. Could
> you change this to 'uint64_t length', please?
> (Assuming we even need the extra length bits?)
>
> With that fixed up...
> Reviewed-by: Darrick J. Wong <[email protected]>
>
> --D
>
> > + struct iomap *srcmap, void *daddr)
> > +{
> > + loff_t head_off = pos & (align_size - 1);
>
> Other nit: head_off = round_down(pos, align_size); ?
We need the offset within a page here, either PTE or PMD. So I think round_down() is not suitable here.
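For example, with a hypothetical pos and 4k pages:

	loff_t pos = 0x11234;
	/* offset within the page: 0x234 */
	loff_t head_off = pos & (PAGE_SIZE - 1);
	/* round_down() would give the aligned start instead: 0x11000 */
	loff_t aligned = round_down(pos, PAGE_SIZE);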
--
Thanks,
Ruan Shiyang.
>
> > + size_t size = ALIGN(head_off + length, align_size);
> > + loff_t end = pos + length;
> > + loff_t pg_end = round_up(end, align_size);
> > + bool copy_all = head_off == 0 && end == pg_end;
> > + void *saddr = 0;
> > + int ret = 0;
> > +
> > + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> > + if (ret)
> > + return ret;
> > +
> > + if (copy_all) {
> > + ret = copy_mc_to_kernel(daddr, saddr, length);
> > + return ret ? -EIO : 0;
> > + }
> > +
> > + /* Copy the head part of the range. Note: we pass offset as length. */
> > + if (head_off) {
> > + ret = copy_mc_to_kernel(daddr, saddr, head_off);
> > + if (ret)
> > + return -EIO;
> > + }
> > +
> > + /* Copy the tail part of the range */
> > + if (end < pg_end) {
> > + loff_t tail_off = head_off + length;
> > + loff_t tail_len = pg_end - end;
> > +
> > + ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
> > + tail_len);
> > + if (ret)
> > + return -EIO;
> > + }
> > + return 0;
> > +}
> > +
> > /*
> > * The user has performed a load from a hole in the file. Allocating a new
> > * page in the file would cause excessive storage usage for workloads
> > with @@ -1167,11 +1222,12 @@ dax_iomap_actor(struct inode *inode,
> loff_t pos, loff_t length, void *data,
> > struct dax_device *dax_dev = iomap->dax_dev;
> > struct iov_iter *iter = data;
> > loff_t end = pos + length, done = 0;
> > + bool write = iov_iter_rw(iter) == WRITE;
> > ssize_t ret = 0;
> > size_t xfer;
> > int id;
> >
> > - if (iov_iter_rw(iter) == READ) {
> > + if (!write) {
> > end = min(end, i_size_read(inode));
> > if (pos >= end)
> > return 0;
> > @@ -1180,7 +1236,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> loff_t length, void *data,
> > return iov_iter_zero(min(length, end - pos), iter);
> > }
> >
> > - if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> > + /*
> > + * In DAX mode, we allow either pure overwrites of written extents, or
> > + * writes to unwritten extents as part of a copy-on-write operation.
> > + */
> > + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
> > + !(iomap->flags & IOMAP_F_SHARED)))
> > return -EIO;
> >
> > /*
> > @@ -1219,6 +1280,13 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> loff_t length, void *data,
> > break;
> > }
> >
> > + if (write && srcmap->addr != iomap->addr) {
> > + ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
> > + kaddr);
> > + if (ret)
> > + break;
> > + }
> > +
> > map_len = PFN_PHYS(map_len);
> > kaddr += offset;
> > map_len -= offset;
> > @@ -1230,7 +1298,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> loff_t length, void *data,
> > * validated via access_ok() in either vfs_read() or
> > * vfs_write(), depending on which operation we are doing.
> > */
> > - if (iov_iter_rw(iter) == WRITE)
> > + if (write)
> > xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
> > map_len, iter);
> > else
> > @@ -1382,6 +1450,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault
> *vmf, pfn_t *pfnp,
> > unsigned long entry_flags = pmd ? DAX_PMD : 0;
> > int err = 0;
> > pfn_t pfn;
> > + void *kaddr;
> >
> > /* if we are reading UNWRITTEN and HOLE, return a hole. */
> > if (!write &&
> > @@ -1392,18 +1461,25 @@ static vm_fault_t dax_fault_actor(struct
> vm_fault *vmf, pfn_t *pfnp,
> > return dax_pmd_load_hole(xas, vmf, iomap, entry);
> > }
> >
> > - if (iomap->type != IOMAP_MAPPED) {
> > + if (iomap->type != IOMAP_MAPPED && !(iomap->flags &
> IOMAP_F_SHARED))
> > +{
> > WARN_ON_ONCE(1);
> > return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
> > }
> >
> > - err = dax_iomap_direct_access(iomap, pos, size, NULL, &pfn);
> > + err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
> > if (err)
> > return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
> >
> > *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
> > write && !sync);
> >
> > + if (write &&
> > + srcmap->addr != IOMAP_HOLE && srcmap->addr != iomap->addr) {
> > + err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
> > + if (err)
> > + return dax_fault_return(err);
> > + }
> > +
> > if (sync)
> > return dax_fault_synchronous_pfnp(pfnp, pfn);
> >
> > --
> > 2.31.1
> >
> >
> >
On Thu, May 13, 2021 at 07:57:47AM +0000, [email protected] wrote:
> > -----Original Message-----
> > From: Darrick J. Wong <[email protected]>
> > Subject: Re: [PATCH v5 1/7] fsdax: Introduce dax_iomap_cow_copy()
> >
> > On Tue, May 11, 2021 at 11:09:27AM +0800, Shiyang Ruan wrote:
> > > In the case where the iomap is a write operation and iomap is not
> > > equal to srcmap after iomap_begin, we consider it a CoW operation.
> > >
> > > The destination extent which iomap indicated is a newly allocated extent.
> > > So, the data needs to be copied from srcmap to the newly allocated extent.
> > > In theory, it is better to copy only the head and tail ranges that fall
> > > outside of the unaligned write range instead of copying the whole aligned
> > > range. But in a dax page fault, it will always be an aligned range. So,
> > > we have to copy the whole range in this case.
> > >
> > > Signed-off-by: Shiyang Ruan <[email protected]>
> > > Reviewed-by: Christoph Hellwig <[email protected]>
> > > ---
> > > fs/dax.c | 86
> > > ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> > > 1 file changed, 81 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index bf3fc8242e6c..f0249bb1d46a 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1038,6 +1038,61 @@ static int dax_iomap_direct_access(struct iomap
> > *iomap, loff_t pos, size_t size,
> > > return rc;
> > > }
> > >
> > > +/**
> > > + * dax_iomap_cow_copy(): Copy the data from source to destination before
> > write.
> > > + * @pos: address to do copy from.
> > > + * @length: size of copy operation.
> > > + * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
> > > + * @srcmap: iomap srcmap
> > > + * @daddr: destination address to copy to.
> > > + *
> > > + * This can be called from two places. Either during DAX write fault,
> > > +to copy
> > > + * the length size data to daddr. Or, while doing normal DAX write
> > > +operation,
> > > + * dax_iomap_actor() might call this to do the copy of either start
> > > +or end
> > > + * unaligned address. In this case the rest of the copy of aligned
> > > +ranges is
> > > + * taken care by dax_iomap_actor() itself.
> > > + * Also, note DAX fault will always result in aligned pos and pos + length.
> > > + */
> > > +static int dax_iomap_cow_copy(loff_t pos, loff_t length, size_t
> > > +align_size,
> >
> > Nit: Linus has asked us not to continue the use of loff_t for file io length. Could
> > you change this to 'uint64_t length', please?
> > (Assuming we even need the extra length bits?)
> >
> > With that fixed up...
> > Reviewed-by: Darrick J. Wong <[email protected]>
> >
> > --D
> >
> > > + struct iomap *srcmap, void *daddr)
> > > +{
> > > + loff_t head_off = pos & (align_size - 1);
> >
> > Other nit: head_off = round_down(pos, align_size); ?
>
> We need the offset within a page here, either PTE or PMD. So I think round_down() is not suitable here.
Oops, yeah. /me wonders if any of Matthew's folio cleanups will reduce
the amount of opencoding around this...
--D
>
>
> --
> Thanks,
> Ruan Shiyang.
>
> >
> > > + size_t size = ALIGN(head_off + length, align_size);
> > > + loff_t end = pos + length;
> > > + loff_t pg_end = round_up(end, align_size);
> > > + bool copy_all = head_off == 0 && end == pg_end;
> > > + void *saddr = 0;
> > > + int ret = 0;
> > > +
> > > + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
> > > + if (ret)
> > > + return ret;
> > > +
> > > + if (copy_all) {
> > > + ret = copy_mc_to_kernel(daddr, saddr, length);
> > > + return ret ? -EIO : 0;
> > > + }
> > > +
> > > + /* Copy the head part of the range. Note: we pass offset as length. */
> > > + if (head_off) {
> > > + ret = copy_mc_to_kernel(daddr, saddr, head_off);
> > > + if (ret)
> > > + return -EIO;
> > > + }
> > > +
> > > + /* Copy the tail part of the range */
> > > + if (end < pg_end) {
> > > + loff_t tail_off = head_off + length;
> > > + loff_t tail_len = pg_end - end;
> > > +
> > > + ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
> > > + tail_len);
> > > + if (ret)
> > > + return -EIO;
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > /*
> > > * The user has performed a load from a hole in the file. Allocating a new
> > > * page in the file would cause excessive storage usage for workloads
> > > with @@ -1167,11 +1222,12 @@ dax_iomap_actor(struct inode *inode,
> > loff_t pos, loff_t length, void *data,
> > > struct dax_device *dax_dev = iomap->dax_dev;
> > > struct iov_iter *iter = data;
> > > loff_t end = pos + length, done = 0;
> > > + bool write = iov_iter_rw(iter) == WRITE;
> > > ssize_t ret = 0;
> > > size_t xfer;
> > > int id;
> > >
> > > - if (iov_iter_rw(iter) == READ) {
> > > + if (!write) {
> > > end = min(end, i_size_read(inode));
> > > if (pos >= end)
> > > return 0;
> > > @@ -1180,7 +1236,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> > loff_t length, void *data,
> > > return iov_iter_zero(min(length, end - pos), iter);
> > > }
> > >
> > > - if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> > > + /*
> > > + * In DAX mode, we allow either pure overwrites of written extents, or
> > > + * writes to unwritten extents as part of a copy-on-write operation.
> > > + */
> > > + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
> > > + !(iomap->flags & IOMAP_F_SHARED)))
> > > return -EIO;
> > >
> > > /*
> > > @@ -1219,6 +1280,13 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> > loff_t length, void *data,
> > > break;
> > > }
> > >
> > > + if (write && srcmap->addr != iomap->addr) {
> > > + ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
> > > + kaddr);
> > > + if (ret)
> > > + break;
> > > + }
> > > +
> > > map_len = PFN_PHYS(map_len);
> > > kaddr += offset;
> > > map_len -= offset;
> > > @@ -1230,7 +1298,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos,
> > loff_t length, void *data,
> > > * validated via access_ok() in either vfs_read() or
> > > * vfs_write(), depending on which operation we are doing.
> > > */
> > > - if (iov_iter_rw(iter) == WRITE)
> > > + if (write)
> > > xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
> > > map_len, iter);
> > > else
> > > @@ -1382,6 +1450,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault
> > *vmf, pfn_t *pfnp,
> > > unsigned long entry_flags = pmd ? DAX_PMD : 0;
> > > int err = 0;
> > > pfn_t pfn;
> > > + void *kaddr;
> > >
> > > /* if we are reading UNWRITTEN and HOLE, return a hole. */
> > > if (!write &&
> > > @@ -1392,18 +1461,25 @@ static vm_fault_t dax_fault_actor(struct
> > vm_fault *vmf, pfn_t *pfnp,
> > > return dax_pmd_load_hole(xas, vmf, iomap, entry);
> > > }
> > >
> > > - if (iomap->type != IOMAP_MAPPED) {
> > > + if (iomap->type != IOMAP_MAPPED && !(iomap->flags &
> > IOMAP_F_SHARED))
> > > +{
> > > WARN_ON_ONCE(1);
> > > return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
> > > }
> > >
> > > - err = dax_iomap_direct_access(iomap, pos, size, NULL, &pfn);
> > > + err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
> > > if (err)
> > > return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
> > >
> > > *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
> > > write && !sync);
> > >
> > > + if (write &&
> > > + srcmap->addr != IOMAP_HOLE && srcmap->addr != iomap->addr) {
> > > + err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
> > > + if (err)
> > > + return dax_fault_return(err);
> > > + }
> > > +
> > > if (sync)
> > > return dax_fault_synchronous_pfnp(pfnp, pfn);
> > >
> > > --
> > > 2.31.1
> > >
> > >
> > >
> -----Original Message-----
> From: Darrick J. Wong <[email protected]>
> Subject: Re: [PATCH v5 5/7] fsdax: Dedup file range to use a compare function
>
> On Tue, May 11, 2021 at 11:09:31AM +0800, Shiyang Ruan wrote:
> > With dax we cannot deal with readpage() etc. So, we create a dax
> > comparison function which is similar to
> > vfs_dedupe_file_range_compare().
> > And introduce dax_remap_file_range_prep() for filesystem use.
> >
> > Signed-off-by: Goldwyn Rodrigues <[email protected]>
> > Signed-off-by: Shiyang Ruan <[email protected]>
> > ---
> > fs/dax.c | 56
> +++++++++++++++++++++++++++++++++++++++++++
> > fs/remap_range.c | 57
> +++++++++++++++++++++++++++++++++++++-------
> > fs/xfs/xfs_reflink.c | 8 +++++--
> > include/linux/dax.h | 4 ++++
> > include/linux/fs.h | 12 ++++++----
> > 5 files changed, 123 insertions(+), 14 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index ee9d28a79bfb..dedf1be0155c 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1853,3 +1853,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault
> *vmf,
> > return dax_insert_pfn_mkwrite(vmf, pfn, order); }
> > EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
> > +
> > +static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
> > + struct inode *ino2, loff_t pos2, loff_t len, void *data,
> > + struct iomap *smap, struct iomap *dmap) {
> > + void *saddr, *daddr;
> > + bool *same = data;
> > + int ret;
> > +
> > + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
> > + *same = true;
> > + return len;
> > + }
> > +
> > + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
> > + *same = false;
> > + return 0;
> > + }
> > +
> > + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
> > + &saddr, NULL);
> > + if (ret < 0)
> > + return -EIO;
> > +
> > + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
> > + &daddr, NULL);
> > + if (ret < 0)
> > + return -EIO;
> > +
> > + *same = !memcmp(saddr, daddr, len);
> > + return len;
> > +}
> > +
> > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
> > + const struct iomap_ops *ops)
> > +{
> > + int id, ret = 0;
> > +
> > + id = dax_read_lock();
> > + while (len) {
> > + ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
> > + is_same, dax_range_compare_actor);
> > + if (ret < 0 || !*is_same)
> > + goto out;
> > +
> > + len -= ret;
> > + srcoff += ret;
> > + destoff += ret;
> > + }
> > + ret = 0;
> > +out:
> > + dax_read_unlock(id);
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
> > diff --git a/fs/remap_range.c b/fs/remap_range.c index
> > e4a5fdd7ad7b..7bc4c8e3aa9f 100644
> > --- a/fs/remap_range.c
> > +++ b/fs/remap_range.c
> > @@ -14,6 +14,7 @@
> > #include <linux/compat.h>
> > #include <linux/mount.h>
> > #include <linux/fs.h>
> > +#include <linux/dax.h>
> > #include "internal.h"
> >
> > #include <linux/uaccess.h>
> > @@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1,
> struct page *page2)
> > * Compare extents of two files to see if they are the same.
> > * Caller must have locked both inodes to prevent write races.
> > */
> > -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > - struct inode *dest, loff_t destoff,
> > - loff_t len, bool *is_same)
> > +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff,
> > + loff_t len, bool *is_same)
> > {
> > loff_t src_poff;
> > loff_t dest_poff;
> > @@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct
> > inode *src, loff_t srcoff,
> > out_error:
> > return error;
> > }
> > +EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
> >
> > /*
> > * Check that the two inodes are eligible for cloning, the ranges
> > make @@ -289,9 +291,11 @@ static int
> vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > * If there's an error, then the usual negative error code is returned.
> > * Otherwise returns 0 with *len set to the request length.
> > */
> > -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > - struct file *file_out, loff_t pos_out,
> > - loff_t *len, unsigned int remap_flags)
> > +static int
> > +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *dax_read_ops)
> > {
> > struct inode *inode_in = file_inode(file_in);
> > struct inode *inode_out = file_inode(file_out); @@ -351,8 +355,17 @@
> > int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > if (remap_flags & REMAP_FILE_DEDUP) {
> > bool is_same = false;
> >
> > - ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > - inode_out, pos_out, *len, &is_same);
> > + if (!IS_DAX(inode_in))
> > + ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > + inode_out, pos_out, *len, &is_same); #ifdef
> CONFIG_FS_DAX
> > + else if (dax_read_ops)
> > + ret = dax_dedupe_file_range_compare(inode_in, pos_in,
> > + inode_out, pos_out, *len, &is_same,
> > + dax_read_ops);
> > +#endif /* CONFIG_FS_DAX */
>
> Hmm, can you add an entry to the !IS_ENABLED(CONFIG_DAX) part of dax.h
> that defines dax_dedupe_file_range_compare as a dummy function that returns
> -EOPNOTSUPP? We try not to sprinkle preprocessor directives into the middle
> of functions, per Linus rules.
I found that it builds fine without the #ifdef and #endif here, even when
CONFIG_FS_DAX is disabled. Like other dax functions in fs/dax.c, such as
dax_iomap_rw(), it is declared in include/linux/dax.h without an IS_ENABLED()
or #ifdef wrapper, and xfs calls it directly without causing a build error.
So, I think I could just remove the #ifdef here, but I am not sure whether
that obeys the rule.
>
> > + else
> > + return -EINVAL;
> > if (ret)
> > return ret;
> > if (!is_same)
> > @@ -370,6 +383,34 @@ int generic_remap_file_range_prep(struct file
> > *file_in, loff_t pos_in,
> >
> > return ret;
> > }
> > +
> > +#ifdef CONFIG_FS_DAX
> > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *ops) {
> > + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags, ops); } #else int
> > +dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags,
> > + const struct iomap_ops *ops) {
> > + return -EOPNOTSUPP;
> > +}
> > +#endif /* CONFIG_FS_DAX */
> > +EXPORT_SYMBOL(dax_remap_file_range_prep);
>
> I think this symbol belongs in fs/dax.c and the declaration in dax.h?
Judging by the function name, it should indeed belong in fs/dax.c. But if so,
__generic_remap_file_range_prep() would need to be called from fs/dax.c, which
I don't think is a good idea.
--
Thanks,
Ruan Shiyang.
>
> --D
>
> > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > + struct file *file_out, loff_t pos_out,
> > + loff_t *len, unsigned int remap_flags) {
> > + return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags, NULL); }
> > EXPORT_SYMBOL(generic_remap_file_range_prep);
> >
> > loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff
> > --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index
> > 060695d6d56a..d25434f93235 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -1329,8 +1329,12 @@ xfs_reflink_remap_prep(
> > if (IS_DAX(inode_in) || IS_DAX(inode_out))
> > goto out_unlock;
> >
> > - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
> > - len, remap_flags);
> > + if (!IS_DAX(inode_in))
> > + ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags);
> > + else
> > + ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
> > + pos_out, len, remap_flags, &xfs_read_iomap_ops);
> > if (ret || *len == 0)
> > goto out_unlock;
> >
> > diff --git a/include/linux/dax.h b/include/linux/dax.h index
> > 3275e01ed33d..32e1c34349f2 100644
> > --- a/include/linux/dax.h
> > +++ b/include/linux/dax.h
> > @@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct
> address_space *mapping,
> > pgoff_t index);
> > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > struct iomap *srcmap);
> > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > + struct inode *dest, loff_t destoff,
> > + loff_t len, bool *is_same,
> > + const struct iomap_ops *ops);
> >  static inline bool dax_mapping(struct address_space *mapping)
> >  {
> >  	return mapping->host && IS_DAX(mapping->host);
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index c3c88fdb9b2a..e2c348553d87 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -71,6 +71,7 @@ struct fsverity_operations;
> >  struct fs_context;
> >  struct fs_parameter_spec;
> >  struct fileattr;
> > +struct iomap_ops;
> >
> > extern void __init inode_init(void);
> >  extern void __init inode_init_early(void);
> > @@ -2126,10 +2127,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> >  extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > 					struct file *file_out, loff_t pos_out,
> > 					size_t len, unsigned int flags);
> > -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > -					  struct file *file_out, loff_t pos_out,
> > -					  loff_t *count,
> > -					  unsigned int remap_flags);
> > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > +				  struct file *file_out, loff_t pos_out,
> > +				  loff_t *count, unsigned int remap_flags);
> > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > +			      struct file *file_out, loff_t pos_out,
> > +			      loff_t *len, unsigned int remap_flags,
> > +			      const struct iomap_ops *ops);
> > extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> > struct file *file_out, loff_t pos_out,
> > loff_t len, unsigned int remap_flags);
> > --
> > 2.31.1
> >
> >
> >
On Fri, May 14, 2021 at 08:35:44AM +0000, [email protected] wrote:
>
>
> > -----Original Message-----
> > From: Darrick J. Wong <[email protected]>
> > Subject: Re: [PATCH v5 5/7] fsdax: Dedup file range to use a compare function
> >
> > On Tue, May 11, 2021 at 11:09:31AM +0800, Shiyang Ruan wrote:
> > > With dax we cannot deal with readpage() etc. So, we create a dax
> > > comparison function which is similar to
> > > vfs_dedupe_file_range_compare().
> > > And introduce dax_remap_file_range_prep() for filesystem use.
> > >
> > > Signed-off-by: Goldwyn Rodrigues <[email protected]>
> > > Signed-off-by: Shiyang Ruan <[email protected]>
> > > ---
> > >  fs/dax.c         | 56 ++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/remap_range.c | 57 +++++++++++++++++++++++++++++++++++++-------
> > > fs/xfs/xfs_reflink.c | 8 +++++--
> > > include/linux/dax.h | 4 ++++
> > > include/linux/fs.h | 12 ++++++----
> > > 5 files changed, 123 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index ee9d28a79bfb..dedf1be0155c 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1853,3 +1853,59 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> > >  	return dax_insert_pfn_mkwrite(vmf, pfn, order);
> > >  }
> > > EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
> > > +
> > > +static loff_t dax_range_compare_actor(struct inode *ino1, loff_t pos1,
> > > + struct inode *ino2, loff_t pos2, loff_t len, void *data,
> > > +		struct iomap *smap, struct iomap *dmap)
> > > +{
> > > + void *saddr, *daddr;
> > > + bool *same = data;
> > > + int ret;
> > > +
> > > + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
> > > + *same = true;
> > > + return len;
> > > + }
> > > +
> > > + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
> > > + *same = false;
> > > + return 0;
> > > + }
> > > +
> > > + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
> > > + &saddr, NULL);
> > > + if (ret < 0)
> > > + return -EIO;
> > > +
> > > + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
> > > + &daddr, NULL);
> > > + if (ret < 0)
> > > + return -EIO;
> > > +
> > > + *same = !memcmp(saddr, daddr, len);
> > > + return len;
> > > +}
> > > +
> > > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > + struct inode *dest, loff_t destoff, loff_t len, bool *is_same,
> > > + const struct iomap_ops *ops)
> > > +{
> > > + int id, ret = 0;
> > > +
> > > + id = dax_read_lock();
> > > + while (len) {
> > > + ret = iomap_apply2(src, srcoff, dest, destoff, len, 0, ops,
> > > + is_same, dax_range_compare_actor);
> > > + if (ret < 0 || !*is_same)
> > > + goto out;
> > > +
> > > + len -= ret;
> > > + srcoff += ret;
> > > + destoff += ret;
> > > + }
> > > + ret = 0;
> > > +out:
> > > + dax_read_unlock(id);
> > > + return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(dax_dedupe_file_range_compare);
> > > diff --git a/fs/remap_range.c b/fs/remap_range.c
> > > index e4a5fdd7ad7b..7bc4c8e3aa9f 100644
> > > --- a/fs/remap_range.c
> > > +++ b/fs/remap_range.c
> > > @@ -14,6 +14,7 @@
> > > #include <linux/compat.h>
> > > #include <linux/mount.h>
> > > #include <linux/fs.h>
> > > +#include <linux/dax.h>
> > > #include "internal.h"
> > >
> > > #include <linux/uaccess.h>
> > > @@ -199,9 +200,9 @@ static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
> > > * Compare extents of two files to see if they are the same.
> > > * Caller must have locked both inodes to prevent write races.
> > > */
> > > -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > - struct inode *dest, loff_t destoff,
> > > - loff_t len, bool *is_same)
> > > +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > + struct inode *dest, loff_t destoff,
> > > + loff_t len, bool *is_same)
> > > {
> > > loff_t src_poff;
> > > loff_t dest_poff;
> > > @@ -280,6 +281,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > out_error:
> > > return error;
> > > }
> > > +EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
> > >
> > > /*
> > >  * Check that the two inodes are eligible for cloning, the ranges make
> > > @@ -289,9 +291,11 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > * If there's an error, then the usual negative error code is returned.
> > > * Otherwise returns 0 with *len set to the request length.
> > > */
> > > -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > - struct file *file_out, loff_t pos_out,
> > > - loff_t *len, unsigned int remap_flags)
> > > +static int
> > > +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > + struct file *file_out, loff_t pos_out,
> > > + loff_t *len, unsigned int remap_flags,
> > > + const struct iomap_ops *dax_read_ops)
> > > {
> > > struct inode *inode_in = file_inode(file_in);
> > >  	struct inode *inode_out = file_inode(file_out);
> > > @@ -351,8 +355,17 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > if (remap_flags & REMAP_FILE_DEDUP) {
> > > bool is_same = false;
> > >
> > > - ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > > - inode_out, pos_out, *len, &is_same);
> > > + if (!IS_DAX(inode_in))
> > > + ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > > +					inode_out, pos_out, *len, &is_same);
> > > +#ifdef CONFIG_FS_DAX
> > > + else if (dax_read_ops)
> > > + ret = dax_dedupe_file_range_compare(inode_in, pos_in,
> > > + inode_out, pos_out, *len, &is_same,
> > > + dax_read_ops);
> > > +#endif /* CONFIG_FS_DAX */
> >
> > Hmm, can you add an entry to the !IS_ENABLED(CONFIG_DAX) part of dax.h
> > that defines dax_dedupe_file_range_compare as a dummy function that returns
> > -EOPNOTSUPP? We try not to sprinkle preprocessor directives into the middle
> > of functions, per Linus rules.
>
> I found that it builds fine without the #ifdef/#endif here, even when
> CONFIG_FS_DAX is disabled.
> Like other dax functions implemented in fs/dax.c, such as
> dax_iomap_rw(), dax_dedupe_file_range_compare() is declared in
> include/linux/dax.h without an IS_ENABLED() or #ifdef wrapper, and xfs
> calls it directly without causing a build error. So, I think I can
> just remove the #ifdef here, but I am not sure whether that obeys the
> rule.
If it doesn't break the build (yay kbuild robot!) then it's probably
ok.
> >
> > > + else
> > > + return -EINVAL;
> > > if (ret)
> > > return ret;
> > > if (!is_same)
> > > @@ -370,6 +383,34 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > >
> > > return ret;
> > > }
> > > +
> > > +#ifdef CONFIG_FS_DAX
> > > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > +			      struct file *file_out, loff_t pos_out,
> > > +			      loff_t *len, unsigned int remap_flags,
> > > +			      const struct iomap_ops *ops)
> > > +{
> > > +	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > > +					       pos_out, len, remap_flags, ops);
> > > +}
> > > +#else
> > > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > +			      struct file *file_out, loff_t pos_out,
> > > +			      loff_t *len, unsigned int remap_flags,
> > > +			      const struct iomap_ops *ops)
> > > +{
> > > +	return -EOPNOTSUPP;
> > > +}
> > > +#endif /* CONFIG_FS_DAX */
> > > +EXPORT_SYMBOL(dax_remap_file_range_prep);
> >
> > I think this symbol belongs in fs/dax.c and the declaration in dax.h?
>
> Judging by the function name, it should indeed belong to fs/dax.c.
> But then __generic_remap_file_range_prep() would need to be called
> from fs/dax.c, which I don't think is good.
Why not? FS_DAX is a boolean, so fs/dax.o will get linked into the same
vmlinux file as fs/remap_range.o.
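
E.g. something roughly like this in fs/dax.c (a sketch only; it assumes
__generic_remap_file_range_prep() gains a declaration in a header shared
with fs/remap_range.c, say fs/internal.h):

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	/* hand the dax iomap ops to the generic range checks */
	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
			pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL(dax_remap_file_range_prep);

Then the declaration (and a !CONFIG_FS_DAX stub, if you want one) can sit
in include/linux/dax.h next to the other dax prototypes.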
--D
>
>
> --
> Thanks,
> Ruan Shiyang.
>
> >
> > --D
> >
> > > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > +				  struct file *file_out, loff_t pos_out,
> > > +				  loff_t *len, unsigned int remap_flags)
> > > +{
> > > +	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
> > > +					       pos_out, len, remap_flags, NULL);
> > > +}
> > > EXPORT_SYMBOL(generic_remap_file_range_prep);
> > >
> > >  loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index 060695d6d56a..d25434f93235 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -1329,8 +1329,12 @@ xfs_reflink_remap_prep(
> > > if (IS_DAX(inode_in) || IS_DAX(inode_out))
> > > goto out_unlock;
> > >
> > > - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
> > > - len, remap_flags);
> > > + if (!IS_DAX(inode_in))
> > > + ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
> > > + pos_out, len, remap_flags);
> > > + else
> > > + ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
> > > + pos_out, len, remap_flags, &xfs_read_iomap_ops);
> > > if (ret || *len == 0)
> > > goto out_unlock;
> > >
> > > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > > index 3275e01ed33d..32e1c34349f2 100644
> > > --- a/include/linux/dax.h
> > > +++ b/include/linux/dax.h
> > > @@ -239,6 +239,10 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> > >  		pgoff_t index);
> > > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > > struct iomap *srcmap);
> > > +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > + struct inode *dest, loff_t destoff,
> > > + loff_t len, bool *is_same,
> > > + const struct iomap_ops *ops);
> > >  static inline bool dax_mapping(struct address_space *mapping)
> > >  {
> > >  	return mapping->host && IS_DAX(mapping->host);
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index c3c88fdb9b2a..e2c348553d87 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -71,6 +71,7 @@ struct fsverity_operations;
> > >  struct fs_context;
> > >  struct fs_parameter_spec;
> > >  struct fileattr;
> > > +struct iomap_ops;
> > >
> > > extern void __init inode_init(void);
> > >  extern void __init inode_init_early(void);
> > > @@ -2126,10 +2127,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> > >  extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > 					struct file *file_out, loff_t pos_out,
> > > 					size_t len, unsigned int flags);
> > > -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > -					  struct file *file_out, loff_t pos_out,
> > > -					  loff_t *count,
> > > -					  unsigned int remap_flags);
> > > +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > +				  struct file *file_out, loff_t pos_out,
> > > +				  loff_t *count, unsigned int remap_flags);
> > > +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
> > > +			      struct file *file_out, loff_t pos_out,
> > > +			      loff_t *len, unsigned int remap_flags,
> > > +			      const struct iomap_ops *ops);
> > > extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
> > > struct file *file_out, loff_t pos_out,
> > > loff_t len, unsigned int remap_flags);
> > > --
> > > 2.31.1
> > >
> > >
> > >
On Wed, May 12, 2021 at 01:37:24AM +0000, [email protected] wrote:
> > -----Original Message-----
> > From: Darrick J. Wong <[email protected]>
> > Subject: Re: [PATCH v5 3/7] fsdax: Add dax_iomap_cow_copy() for
> > dax_iomap_zero
> >
> > On Tue, May 11, 2021 at 11:09:29AM +0800, Shiyang Ruan wrote:
> > > Punching a hole in a reflinked file needs dax_copy_edge() too.
> > > Otherwise, data in the non-aligned area will not be correct. So, add
> > > the srcmap to dax_iomap_zero() and replace memset() with
> > > dax_copy_edge().
> > >
> > > Signed-off-by: Shiyang Ruan <[email protected]>
> > > Reviewed-by: Ritesh Harjani <[email protected]>
> > > ---
> > > fs/dax.c | 25 +++++++++++++++----------
> > > fs/iomap/buffered-io.c | 2 +-
> > > include/linux/dax.h | 3 ++-
> > > 3 files changed, 18 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index ef0e564e7904..ee9d28a79bfb 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1186,7 +1186,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> > >  }
> > >  #endif /* CONFIG_FS_DAX_PMD */
> > >
> > > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > > + struct iomap *srcmap)
> > > {
> > > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> > > pgoff_t pgoff;
> > > @@ -1208,19 +1209,23 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > >
> > > if (page_aligned)
> > > rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> > > - else
> > > + else {
> > > rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> > > - if (rc < 0) {
> > > - dax_read_unlock(id);
> > > - return rc;
> > > - }
> > > -
> > > - if (!page_aligned) {
> > > - memset(kaddr + offset, 0, size);
> > > + if (rc < 0)
> > > + goto out;
> > > + if (iomap->addr != srcmap->addr) {
> >
> > Why isn't this "if (srcmap->type != IOMAP_HOLE)" ?
> >
> > I suppose it has the same effect, since @iomap should never be a hole and we
> > should never have a @srcmap that's the same as @iomap, but still, we use
> > IOMAP_HOLE checks in most other parts of fs/iomap/.
>
> In its caller `iomap_zero_range_actor()`, srcmap->type has already been
> checked against IOMAP_HOLE before `dax_iomap_zero()` is called. So the
> check you suggested would always be true...
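> 
> The early return in the actor looks roughly like this (quoting v5.13-rc1
> from memory, so the exact form may differ):
> 
> 	/* in iomap_zero_range_actor(): already zeroed?  we're done. */
> 	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
> 		return length;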
Ah right, so it is. I'll go review the newest version of this patch.
--D
>
>
> --
> Thanks,
> Ruan Shiyang.
>
> >
> > Other than that, the logic looks decent to me.
> >
> > --D
> >
> > > + rc = dax_iomap_cow_copy(offset, size, PAGE_SIZE, srcmap,
> > > + kaddr);
> > > + if (rc < 0)
> > > + goto out;
> > > + } else
> > > + memset(kaddr + offset, 0, size);
> > > dax_flush(iomap->dax_dev, kaddr + offset, size);
> > > }
> > > +
> > > +out:
> > > dax_read_unlock(id);
> > > - return size;
> > > + return rc < 0 ? rc : size;
> > > }
> > >
> > > static loff_t
> > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > > index f2cd2034a87b..2734955ea67f 100644
> > > --- a/fs/iomap/buffered-io.c
> > > +++ b/fs/iomap/buffered-io.c
> > > @@ -933,7 +933,7 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
> > > s64 bytes;
> > >
> > > if (IS_DAX(inode))
> > > - bytes = dax_iomap_zero(pos, length, iomap);
> > > + bytes = dax_iomap_zero(pos, length, iomap, srcmap);
> > > else
> > > bytes = iomap_zero(inode, pos, length, iomap, srcmap);
> > > if (bytes < 0)
> > > diff --git a/include/linux/dax.h b/include/linux/dax.h
> > > index b52f084aa643..3275e01ed33d 100644
> > > --- a/include/linux/dax.h
> > > +++ b/include/linux/dax.h
> > > @@ -237,7 +237,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> > >  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> > >  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> > >  		pgoff_t index);
> > > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> > > +s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap,
> > > + struct iomap *srcmap);
> > >  static inline bool dax_mapping(struct address_space *mapping)
> > >  {
> > >  	return mapping->host && IS_DAX(mapping->host);
> > > --
> > > 2.31.1
> > >
> > >
> > >
> >
>