This patchset aims to take care of the issues that prevent reflink and
dedupe from working correctly in XFS.
It is based on Goldwyn's patchsets: "v4 Btrfs dax support" and "Btrfs
iomap". I picked up the related patches and made a few fixes so that it
basically works.
For dax framework:
1. adapt to the latest change in iomap.
For XFS:
1. report the source address and set IOMAP_COW type for those write
operations that need COW.
2. update extent list at the end.
3. add file contents comparison function based on dax framework.
4. use xfs_break_layouts() to support dax.
Goldwyn Rodrigues (3):
dax: replace mmap entry in case of CoW
fs: dedup file range to use a compare function
dax: memcpy before zeroing range
Shiyang Ruan (4):
dax: Introduce dax_copy_edges() for COW.
dax: copy data before write.
xfs: Add COW handle for fsdax.
xfs: Add dedupe support for fsdax.
fs/btrfs/ioctl.c | 11 ++-
fs/dax.c | 203 ++++++++++++++++++++++++++++++++++++++----
fs/iomap.c | 9 +-
fs/ocfs2/file.c | 2 +-
fs/read_write.c | 11 +--
fs/xfs/xfs_iomap.c | 42 +++++----
fs/xfs/xfs_reflink.c | 84 +++++++++--------
include/linux/dax.h | 15 ++--
include/linux/fs.h | 8 +-
include/linux/iomap.h | 6 ++
10 files changed, 294 insertions(+), 97 deletions(-)
--
2.17.0
From: Goldwyn Rodrigues <[email protected]>
We replace the existing entry with the newly allocated one in case of
CoW. Also, we mark the entry as PAGECACHE_TAG_TOWRITE so that writeback
marks this entry as write-protected. This helps with snapshots, so that
new write page faults after a snapshot trigger a CoW.
Btrfs does not support hugepages, so we don't handle the PMD case.
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/dax.c | 36 ++++++++++++++++++++++++++++--------
1 file changed, 28 insertions(+), 8 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 084cc21d47a4..8eb065a1ec51 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -700,6 +700,9 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
return 0;
}
+#define DAX_IF_DIRTY (1ULL << 0)
+#define DAX_IF_COW (1ULL << 1)
+
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -709,14 +712,17 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
*/
static void *dax_insert_entry(struct xa_state *xas,
struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+ void *entry, pfn_t pfn, unsigned long flags,
+ unsigned int insert_flags)
{
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = insert_flags & DAX_IF_DIRTY;
+ bool cow = insert_flags & DAX_IF_COW;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -728,12 +734,12 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+ if (cow || (dax_entry_size(entry) != dax_entry_size(new_entry))) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
}
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -753,6 +759,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
@@ -1040,7 +1049,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
vm_fault_t ret;
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ DAX_ZERO_PAGE, 0);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1310,6 +1319,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
void *entry;
pfn_t pfn;
void *kaddr;
+ unsigned long insert_flags = 0;
trace_dax_pte_fault(inode, vmf, ret);
/*
@@ -1371,6 +1381,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
error = copy_user_dax(iomap.bdev, iomap.dax_dev,
sector, PAGE_SIZE, vmf->cow_page, vaddr);
break;
+ case IOMAP_COW:
+ /* Should not be setting this - fallthrough */
default:
WARN_ON_ONCE(1);
error = -EIO;
@@ -1391,6 +1403,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
switch (iomap.type) {
case IOMAP_COW:
+ insert_flags |= DAX_IF_COW;
+ /* fallthrough */
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
@@ -1401,8 +1415,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
+ if (write && !sync)
+ insert_flags |= DAX_IF_DIRTY;
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- 0, write && !sync);
+ 0, insert_flags);
if (iomap.type == IOMAP_COW) {
error = dax_copy_edges(inode, pos, PAGE_SIZE, &srcmap,
@@ -1490,7 +1506,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pfn = page_to_pfn_t(zero_page);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ DAX_PMD | DAX_ZERO_PAGE, 0);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1542,6 +1558,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
int error;
pfn_t pfn;
void *kaddr;
+ unsigned long insert_flags = 0;
/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1627,8 +1644,11 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
+ if (write && !sync)
+ insert_flags |= DAX_IF_DIRTY;
+
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
- DAX_PMD, write && !sync);
+ DAX_PMD, insert_flags);
if (iomap.type == IOMAP_COW) {
error = dax_copy_edges(inode, pos, PMD_SIZE, &srcmap,
--
2.17.0
From: Goldwyn Rodrigues <[email protected]>
With DAX we cannot use readpage() etc. So, we create a
function callback to perform the file data comparison and pass
it to generic_remap_file_range_prep() so it can use iomap-based
functions.
This may not be the best way to solve this. Suggestions welcome.
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/btrfs/ioctl.c | 11 ++++++--
fs/dax.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/file.c | 2 +-
fs/read_write.c | 11 ++++----
fs/xfs/xfs_reflink.c | 2 +-
include/linux/dax.h | 4 +++
include/linux/fs.h | 8 +++++-
7 files changed, 93 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd4e693406a0..8f9c749c4aa6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3936,6 +3936,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
+ compare_range_t cmp;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
@@ -3997,8 +3998,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (ret < 0)
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ if (IS_DAX(file_inode(file_in)) && IS_DAX(file_inode(file_out)))
+ cmp = btrfs_dax_file_range_compare;
+ else
+ cmp = vfs_dedupe_file_range_compare;
+
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, cmp);
+
if (ret < 0 || *len == 0)
goto out_unlock;
diff --git a/fs/dax.c b/fs/dax.c
index 8eb065a1ec51..8111ba93f4d3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -39,6 +39,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
static inline unsigned int pe_order(enum page_entry_size pe_size)
{
if (pe_size == PE_SIZE_PTE)
@@ -1817,3 +1819,66 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static inline void *iomap_address(struct iomap *iomap, loff_t off, loff_t len)
+{
+ loff_t start;
+ void *addr;
+ start = (get_start_sect(iomap->bdev) << 9) + iomap->addr +
+ (off - iomap->offset);
+ dax_direct_access(iomap->dax_dev, PHYS_PFN(start), PHYS_PFN(len),
+ &addr, NULL);
+ return addr;
+}
+
+int dax_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest,
+ loff_t destoff, loff_t len, bool *is_same, const struct iomap_ops *ops)
+{
+ void *saddr, *daddr;
+ struct iomap s_iomap = {0};
+ struct iomap d_iomap = {0};
+ bool same = true;
+ loff_t cmp_len;
+ int id, ret = 0;
+
+ id = dax_read_lock();
+ while (len) {
+ ret = ops->iomap_begin(src, srcoff, len, 0, &s_iomap, NULL);
+ if (ret < 0) {
+ if (ops->iomap_end)
+ ops->iomap_end(src, srcoff, len, ret, 0, &s_iomap);
+ return ret;
+ }
+ cmp_len = len;
+ cmp_len = MIN(len, s_iomap.offset + s_iomap.length - srcoff);
+
+ ret = ops->iomap_begin(dest, destoff, cmp_len, 0, &d_iomap, NULL);
+ if (ret < 0) {
+ if (ops->iomap_end) {
+ ops->iomap_end(src, srcoff, len, ret, 0, &s_iomap);
+ ops->iomap_end(dest, destoff, len, ret, 0, &d_iomap);
+ }
+ return ret;
+ }
+ cmp_len = MIN(cmp_len, d_iomap.offset + d_iomap.length - destoff);
+
+ saddr = iomap_address(&s_iomap, srcoff, cmp_len);
+ daddr = iomap_address(&d_iomap, destoff, cmp_len);
+
+ same = !memcmp(saddr, daddr, cmp_len);
+ if (!same)
+ break;
+ len -= cmp_len;
+ srcoff += cmp_len;
+ destoff += cmp_len;
+
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(src, srcoff, len, 0, 0, &s_iomap);
+ ret = ops->iomap_end(dest, destoff, len, 0, 0, &d_iomap);
+ }
+ }
+ dax_read_unlock(id);
+ *is_same = same;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_file_range_compare);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d640c5f8a85d..9d470306cfc3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2558,7 +2558,7 @@ static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
goto out_unlock;
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- &len, remap_flags);
+ &len, remap_flags, vfs_dedupe_file_range_compare);
if (ret < 0 || len == 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index 61b43ad7608e..c6283802ef1c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1778,7 +1778,7 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same)
{
@@ -1845,6 +1845,7 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
out_error:
return error;
}
+EXPORT_SYMBOL_GPL(vfs_dedupe_file_range_compare);
/*
* Check that the two inodes are eligible for cloning, the ranges make
@@ -1856,7 +1857,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
*/
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+ loff_t *len, unsigned int remap_flags,
+ compare_range_t compare)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1915,9 +1917,8 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
*/
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ ret = (*compare)(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 680ae7662a78..68e4257cebb0 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1350,7 +1350,7 @@ xfs_reflink_remap_prep(
goto out_unlock;
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ len, remap_flags, vfs_dedupe_file_range_compare);
if (ret < 0 || *len == 0)
goto out_unlock;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0dd316a74a29..1370d39c91b6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -157,6 +157,10 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
+int dax_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same,
+ const struct iomap_ops *ops);
#ifdef CONFIG_FS_DAX
int __dax_zero_page_range(struct block_device *bdev,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e7679089..0224503e42ce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1883,10 +1883,16 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
+extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same);
+typedef int (*compare_range_t)(struct inode *src, loff_t srcpos,
+ struct inode *dest, loff_t destpos,
+ loff_t len, bool *is_same);
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count,
- unsigned int remap_flags);
+ unsigned int remap_flags, compare_range_t cmp);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
--
2.17.0
From: Goldwyn Rodrigues <[email protected]>
Copying the source data before zeroing a range needed more iomap
fields, so it was easier to pass the iomap and compute inside the
function rather than passing a lot of arguments.
Note, there is a subtle difference between iomap_sector() and
dax_iomap_sector(). Can we replace dax_iomap_sector() with
iomap_sector()? It would need pos & PAGE_MASK though, or else
bdev_dax_pgoff() returns -EINVAL.
Signed-off-by: Goldwyn Rodrigues <[email protected]>
---
fs/dax.c | 17 ++++++++++++-----
fs/iomap.c | 9 +--------
include/linux/dax.h | 11 +++++------
include/linux/iomap.h | 6 ++++++
4 files changed, 24 insertions(+), 19 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 8111ba93f4d3..c92111950612 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1071,11 +1071,16 @@ static bool dax_range_is_aligned(struct block_device *bdev,
return true;
}
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int size)
+int __dax_zero_page_range(struct iomap *iomap, loff_t pos,
+ unsigned int offset, unsigned int size)
{
- if (dax_range_is_aligned(bdev, offset, size)) {
+ sector_t sector = dax_iomap_sector(iomap, pos & PAGE_MASK);
+ struct block_device *bdev = iomap->bdev;
+ struct dax_device *dax_dev = iomap->dax_dev;
+ int ret = 0;
+
+ if (!(iomap->type == IOMAP_COW) &&
+ dax_range_is_aligned(bdev, offset, size)) {
sector_t start_sector = sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
@@ -1095,11 +1100,13 @@ int __dax_zero_page_range(struct block_device *bdev,
dax_read_unlock(id);
return rc;
}
+ if (iomap->type == IOMAP_COW)
+ ret = memcpy_mcsafe(kaddr, iomap->inline_data, offset);
memset(kaddr + offset, 0, size);
dax_flush(dax_dev, kaddr + offset, size);
dax_read_unlock(id);
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
diff --git a/fs/iomap.c b/fs/iomap.c
index 3377a7ec5b7d..b5662b5bac26 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -99,12 +99,6 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
}
-static sector_t
-iomap_sector(struct iomap *iomap, loff_t pos)
-{
- return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
-}
-
static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
@@ -993,8 +987,7 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
- iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
+ return __dax_zero_page_range(iomap, pos, offset, bytes);
}
static loff_t
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 1370d39c91b6..c469d9ff54b4 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -9,6 +9,7 @@
typedef unsigned long dax_entry_t;
+struct iomap;
struct iomap_ops;
struct dax_device;
struct dax_operations {
@@ -163,13 +164,11 @@ int dax_file_range_compare(struct inode *src, loff_t srcoff,
const struct iomap_ops *ops);
#ifdef CONFIG_FS_DAX
-int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int length);
+int __dax_zero_page_range(struct iomap *iomap, loff_t pos,
+ unsigned int offset, unsigned int size);
#else
-static inline int __dax_zero_page_range(struct block_device *bdev,
- struct dax_device *dax_dev, sector_t sector,
- unsigned int offset, unsigned int length)
+static inline int __dax_zero_page_range(struct iomap *iomap, loff_t pos,
+ unsigned int offset, unsigned int size)
{
return -ENXIO;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 75b78dd91c3c..3628e57954c6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/mm_types.h>
+#include <linux/blkdev.h>
struct address_space;
struct fiemap_extent_info;
@@ -122,6 +123,11 @@ static inline struct iomap_page *to_iomap_page(struct page *page)
return NULL;
}
+static inline sector_t iomap_sector(struct iomap *iomap, loff_t pos)
+{
+ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
--
2.17.0
Add dax_copy_edges() to each DAX actor function to perform COW when the
IOMAP_COW type is set.
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 450baafe2ea4..084cc21d47a4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1098,11 +1098,12 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
* offset/offset+length are not page aligned.
*/
static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *srcmap, void *daddr)
+ struct iomap *srcmap, void *daddr, bool pmd)
{
- unsigned offset = pos & (PAGE_SIZE - 1);
+ size_t page_size = pmd ? PMD_SIZE : PAGE_SIZE;
+ unsigned offset = pos & (page_size - 1);
loff_t end = pos + length;
- loff_t pg_end = round_up(end, PAGE_SIZE);
+ loff_t pg_end = round_up(end, page_size);
void *saddr = 0;
int ret = 0;
@@ -1153,7 +1154,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return iov_iter_zero(min(length, end - pos), iter);
}
- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ iomap->type != IOMAP_COW))
return -EIO;
/*
@@ -1192,6 +1194,13 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
break;
}
+ if (iomap->type == IOMAP_COW) {
+ ret = dax_copy_edges(inode, pos, length, srcmap, kaddr,
+ false);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@@ -1300,6 +1309,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
vm_fault_t ret = 0;
void *entry;
pfn_t pfn;
+ void *kaddr;
trace_dax_pte_fault(inode, vmf, ret);
/*
@@ -1380,19 +1390,27 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
sync = dax_fault_is_synchronous(flags, vma, &iomap);
switch (iomap.type) {
+ case IOMAP_COW:
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn, NULL);
+ error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn, &kaddr);
if (error < 0)
goto error_finish_iomap;
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);
+ if (iomap.type == IOMAP_COW) {
+ error = dax_copy_edges(inode, pos, PAGE_SIZE, &srcmap,
+ kaddr, false);
+ if (error)
+ goto error_finish_iomap;
+ }
+
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PTE into page tables only after that happens.
@@ -1523,6 +1541,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
loff_t pos;
int error;
pfn_t pfn;
+ void *kaddr;
/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1602,14 +1621,22 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
switch (iomap.type) {
+ case IOMAP_COW:
case IOMAP_MAPPED:
- error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn, NULL);
+ error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn, &kaddr);
if (error < 0)
goto finish_iomap;
entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
DAX_PMD, write && !sync);
+ if (iomap.type == IOMAP_COW) {
+ error = dax_copy_edges(inode, pos, PMD_SIZE, &srcmap,
+ kaddr, true);
+ if (error)
+ goto unlock_entry;
+ }
+
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PMD into page tables only after that happens.
--
2.17.0
Copy the source data to the destination address before write. Change
dax_iomap_pfn() to return the address as well, so that it can be used to
perform a memcpy when the type is IOMAP_COW.
dax_copy_edges() is a helper function that performs a copy from one part
of the device to another for data that is not page aligned.
Signed-off-by: Goldwyn Rodrigues <[email protected]>
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/dax.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 52 insertions(+), 4 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 88b93796855e..450baafe2ea4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -984,7 +984,7 @@ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
}
static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
- pfn_t *pfnp)
+ pfn_t *pfnp, void **addr)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
@@ -996,11 +996,13 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
return rc;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- NULL, pfnp);
+ addr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
+ if (!pfnp)
+ goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -1010,6 +1012,12 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
+
+out_check_addr:
+ if (!addr)
+ goto out;
+ if (!*addr)
+ rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
@@ -1084,6 +1092,46 @@ int __dax_zero_page_range(struct block_device *bdev,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
+/*
+ * dax_copy_edges - Copies the part of the pages not included in
+ * the write, but required for CoW because
+ * offset/offset+length are not page aligned.
+ */
+static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *srcmap, void *daddr)
+{
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, PAGE_SIZE);
+ void *saddr = 0;
+ int ret = 0;
+
+ ret = dax_iomap_pfn(srcmap, pos, length, NULL, &saddr);
+ if (ret)
+ return ret;
+ /*
+ * Copy the first part of the page
+ * Note: we pass offset as length
+ */
+ if (offset) {
+ if (saddr)
+ ret = memcpy_mcsafe(daddr, saddr, offset);
+ else
+ memset(daddr, 0, offset);
+ }
+
+ /* Copy the last part of the range */
+ if (end < pg_end) {
+ if (saddr)
+ ret = memcpy_mcsafe(daddr + offset + length,
+ saddr + offset + length, pg_end - end);
+ else
+ memset(daddr + offset + length, 0,
+ pg_end - end);
+ }
+ return ret;
+}
+
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap, struct iomap *srcmap)
@@ -1338,7 +1386,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+ error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn, NULL);
if (error < 0)
goto error_finish_iomap;
@@ -1555,7 +1603,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
switch (iomap.type) {
case IOMAP_MAPPED:
- error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+ error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn, NULL);
if (error < 0)
goto finish_iomap;
--
2.17.0
In DAX mode, use the new range comparison function provided by the DAX
framework. Don't share a DAX file with a non-DAX file. Use XFS locks and
xfs_break_layouts() to simplify the lock and break-layout operations, and
rename the function to xfs_reflink_remap_lock_and_break_layout() to echo
the unlock function: xfs_reflink_remap_unlock().
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_reflink.c | 83 +++++++++++++++++++++++---------------------
1 file changed, 44 insertions(+), 39 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index a1b000be3699..096751d7990a 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1209,39 +1209,37 @@ xfs_reflink_remap_blocks(
* src iolock held, and therefore have to back out both locks.
*/
static int
-xfs_iolock_two_inodes_and_break_layout(
- struct inode *src,
- struct inode *dest)
+xfs_reflink_remap_lock_and_break_layout(
+ struct file *file_in,
+ struct file *file_out)
{
int error;
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+
+ uint src_iolock = XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED;
+ uint dest_iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-retry:
- if (src < dest) {
- inode_lock_shared(src);
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ if (src->i_ino < dest->i_ino) {
+ xfs_ilock(src, src_iolock);
+ xfs_ilock(dest, dest_iolock);
} else {
- /* src >= dest */
- inode_lock(dest);
+ /* inode_in >= inode_out */
+ xfs_ilock(dest, dest_iolock);
}
- error = break_layout(dest, false);
- if (error == -EWOULDBLOCK) {
- inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
- error = break_layout(dest, true);
- if (error)
- return error;
- goto retry;
- }
+ error = xfs_break_layouts(inode_out, &dest_iolock, BREAK_UNMAP);
if (error) {
- inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
+ xfs_iunlock(dest, dest_iolock);
+ if (src->i_ino < dest->i_ino)
+ xfs_iunlock(src, src_iolock);
return error;
}
- if (src > dest)
- inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
+
+ if (src->i_ino > dest->i_ino)
+ xfs_ilock(src, src_iolock);
return 0;
}
@@ -1257,12 +1255,12 @@ xfs_reflink_remap_unlock(
struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out);
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
- inode_unlock(inode_out);
+ uint src_iolock = XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED;
+ uint dest_iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+
+ xfs_iunlock(dest, dest_iolock);
if (!same_inode)
- inode_unlock_shared(inode_in);
+ xfs_iunlock(src, src_iolock);
}
/*
@@ -1285,6 +1283,14 @@ xfs_reflink_zero_posteof(
&xfs_iomap_ops);
}
+int xfs_reflink_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
+{
+ return dax_file_range_compare(src, srcoff, dest, destoff, len, is_same,
+ &xfs_iomap_ops);
+}
+
/*
* Prepare two files for range cloning. Upon a successful return both inodes
* will have the iolock and mmaplock held, the page cache of the out file will
@@ -1327,18 +1333,13 @@ xfs_reflink_remap_prep(
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
ssize_t ret;
+ compare_range_t cmp;
/* Lock both files against IO */
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+ ret = xfs_reflink_remap_lock_and_break_layout(file_in, file_out);
if (ret)
return ret;
- if (same_inode)
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
- XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
@@ -1346,12 +1347,16 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
+ if (IS_DAX(inode_in))
+ cmp = xfs_reflink_dedupe_file_range_compare;
+ else
+ cmp = vfs_dedupe_file_range_compare;
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags, vfs_dedupe_file_range_compare);
+ len, remap_flags, cmp);
if (ret < 0 || *len == 0)
goto out_unlock;
--
2.17.0
WRITE and ZERO on a shared extent need to perform COW. Direct I/O in
DAX mode is handled like WRITE, but block aligned, so COW seems a bit
redundant for it.
Because of COW, a new extent has been allocated, so the extent list
needs to be updated at the end.
Signed-off-by: Shiyang Ruan <[email protected]>
---
fs/xfs/xfs_iomap.c | 42 +++++++++++++++++++++++++-----------------
fs/xfs/xfs_reflink.c | 1 +
2 files changed, 26 insertions(+), 17 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6116a75f03da..ae3b9bf5d784 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -930,10 +930,10 @@ xfs_file_iomap_begin(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
+ struct xfs_bmbt_irec imap = { 0 }, cmap = { 0 };
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
- bool shared = false;
+ bool shared = false, need_cow = false;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
@@ -967,6 +967,8 @@ xfs_file_iomap_begin(
if (error)
goto out_unlock;
+ cmap = imap;
+
if (flags & IOMAP_REPORT) {
/* Trim the mapping to the nearest shared extent boundary. */
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
@@ -983,8 +985,7 @@ xfs_file_iomap_begin(
* been done up front, so we don't need to do them here.
*/
if (xfs_is_cow_inode(ip)) {
- struct xfs_bmbt_irec cmap;
- bool directio = (flags & IOMAP_DIRECT);
+ bool directio = flags & IOMAP_DIRECT;
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
@@ -992,23 +993,21 @@ xfs_file_iomap_begin(
goto out_found;
/* may drop and re-acquire the ilock */
- cmap = imap;
error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
- directio);
+ directio || IS_DAX(inode));
if (error)
goto out_unlock;
/*
- * For buffered writes we need to report the address of the
- * previous block (if there was any) so that the higher level
- * write code can perform read-modify-write operations; we
- * won't need the CoW fork mapping until writeback. For direct
- * I/O, which must be block aligned, we need to report the
- * newly allocated address. If the data fork has a hole, copy
- * the COW fork mapping to avoid allocating to the data fork.
+ * WRITE and ZERO on a shared extent under dax mode need to
+ * perform COW, source address will be reported in srcmap.
*/
- if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ if (imap.br_startblock != HOLESTARTBLOCK && IS_DAX(inode) &&
+ shared) {
+ need_cow = true;
+ } else {
imap = cmap;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1044,8 +1043,7 @@ xfs_file_iomap_begin(
*/
if (lockmode == XFS_ILOCK_EXCL)
xfs_ilock_demote(ip, lockmode);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap, nimaps);
if (error)
return error;
@@ -1053,7 +1051,15 @@ xfs_file_iomap_begin(
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+ if (need_cow) {
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, shared);
+ if (error)
+ return error;
+ iomap->flags |= IOMAP_F_NEW;
+ iomap->type = IOMAP_COW;
+ return xfs_bmbt_to_iomap(ip, srcmap, &imap, shared);
+ } else
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
@@ -1135,6 +1141,8 @@ xfs_file_iomap_end(
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
+ else if (iomap->type == IOMAP_COW && written)
+ return xfs_reflink_end_cow(XFS_I(inode), offset, length);
return 0;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 68e4257cebb0..a1b000be3699 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -446,6 +446,7 @@ xfs_reflink_allocate_cow(
*/
if (!convert_now || imap->br_state == XFS_EXT_NORM)
return 0;
+ imap->br_state = XFS_EXT_NORM;
trace_xfs_reflink_convert_cow(ip, imap);
return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
--
2.17.0
On 19:49 31/07, Shiyang Ruan wrote:
> This patchset aims to take care of the issues that prevent reflink and
> dedupe from working correctly in XFS.
>
> It is based on Goldwyn's patchsets: "v4 Btrfs dax support" and "Btrfs
> iomap". I picked up the related patches and made a few fixes so that it
> basically works.
>
> For dax framework:
> 1. adapt to the latest change in iomap.
>
> For XFS:
> 1. report the source address and set IOMAP_COW type for those write
> operations that need COW.
> 2. update extent list at the end.
> 3. add file contents comparison function based on dax framework.
> 4. use xfs_break_layouts() to support dax.
Shiyang,
I think you used the older patches which do not contain the iomap changes
which would call iomap_begin() with two iomaps. I have it in the btrfs-iomap
branch and plan to update it today. It is built on v5.3-rcX, so it should
contain the changes which move the iomap code to a different directory.
I will build the dax patches on top of that.
However, we are making a big dependency chain here :(
--
Goldwyn
>
>
> Goldwyn Rodrigues (3):
> dax: replace mmap entry in case of CoW
> fs: dedup file range to use a compare function
> dax: memcpy before zeroing range
>
> Shiyang Ruan (4):
> dax: Introduce dax_copy_edges() for COW.
> dax: copy data before write.
> xfs: Add COW handle for fsdax.
> xfs: Add dedupe support for fsdax.
>
> fs/btrfs/ioctl.c | 11 ++-
> fs/dax.c | 203 ++++++++++++++++++++++++++++++++++++++----
> fs/iomap.c | 9 +-
> fs/ocfs2/file.c | 2 +-
> fs/read_write.c | 11 +--
> fs/xfs/xfs_iomap.c | 42 +++++----
> fs/xfs/xfs_reflink.c | 84 +++++++++--------
> include/linux/dax.h | 15 ++--
> include/linux/fs.h | 8 +-
> include/linux/iomap.h | 6 ++
> 10 files changed, 294 insertions(+), 97 deletions(-)
>
> --
> 2.17.0
>
>
>
--
Goldwyn
On 8/1/19 4:33 AM, Goldwyn Rodrigues wrote:
> On 19:49 31/07, Shiyang Ruan wrote:
>> This patchset aims to take care of the issues that prevent reflink and
>> dedupe from working correctly in XFS.
>>
>> It is based on Goldwyn's patchsets: "v4 Btrfs dax support" and "Btrfs
>> iomap". I picked up the related patches and made a few fixes so that it
>> basically works.
>>
>> For dax framework:
>> 1. adapt to the latest change in iomap.
>>
>> For XFS:
>> 1. report the source address and set IOMAP_COW type for those write
>> operations that need COW.
>> 2. update extent list at the end.
>> 3. add file contents comparison function based on dax framework.
>> 4. use xfs_break_layouts() to support dax.
>
> Shiyang,
>
> I think you used the older patches which do not contain the iomap changes
> which would call iomap_begin() with two iomaps. I have it in the btrfs-iomap
Oh, sorry for my carelessness. This patchset is built on your "Btrfs
iomap" patchset. I didn't point it out in the cover letter.
> branch and plan to update it today. It is built on v5.3-rcX, so it should
> contain the changes which move the iomap code to a different directory.
> I will build the dax patches on top of that.
> However, we are making a big dependency chain here
Don't worry. It's fine for me. I'll follow your updates.
>
--
Thanks,
Shiyang Ruan.
On Thu, Aug 01, 2019 at 09:37:04AM +0800, Shiyang Ruan wrote:
>
>
> On 8/1/19 4:33 AM, Goldwyn Rodrigues wrote:
> > On 19:49 31/07, Shiyang Ruan wrote:
> > > This patchset aims to take care of the issues that prevent reflink and
> > > dedupe from working correctly in XFS.
> > >
> > > It is based on Goldwyn's patchsets: "v4 Btrfs dax support" and "Btrfs
> > > iomap". I picked up the related patches and made a few fixes so that it
> > > basically works.
> > >
> > > For dax framework:
> > > 1. adapt to the latest change in iomap.
> > >
> > > For XFS:
> > > 1. report the source address and set IOMAP_COW type for those write
> > > operations that need COW.
> > > 2. update extent list at the end.
> > > 3. add file contents comparison function based on dax framework.
> > > 4. use xfs_break_layouts() to support dax.
> >
> > Shiyang,
> >
> > I think you used the older patches which do not contain the iomap changes
> > which would call iomap_begin() with two iomaps. I have it in the btrfs-iomap
>
> Oh, sorry for my carelessness. This patchset is built on your "Btrfs
> iomap" patchset. I didn't point it out in the cover letter.
>
> > branch and plan to update it today. It is built on v5.3-rcX, so it should
> > contain the changes which move the iomap code to a different directory.
> > I will build the dax patches on top of that.
> > However, we are making a big dependency chain here
> Don't worry. It's fine for me. I'll follow your updates.
Hi Shiyang,
I'll wait for you to update your patches on top of the latest btrfs
patches before looking at this any further. It would be good to get
a set of iomap infrastructure patches separated from the btrfs
patchsets so we could have them both built from a common patchset.
Cheers,
Dave.
--
Dave Chinner
[email protected]
Btw, I just had a chat with Dan last week on this. And he pointed out
that while this series deals with the read/write path issues of
reflink on DAX it doesn't deal with the mmap side issue that
page->mapping and page->index can point back to exactly one file.
I think we want a few xfstests that reflink a file and then use the
different links using mmap, as that should blow up pretty reliably.
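For illustration, a minimal userspace sketch of such a test program,
assuming a reflink-capable XFS filesystem mounted with DAX at /mnt and
using FICLONE for the reflink. The paths, file sizes and names here are
made up, and this is not part of the patch series:

/* Hypothetical reproducer sketch: mwrite both links of a reflinked
 * file. Assumes /mnt/a.img already exists, is at least one page long,
 * and lives on a DAX-mounted XFS filesystem with reflink enabled.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/fs.h>		/* FICLONE */
#include <unistd.h>

int main(void)
{
	int src = open("/mnt/a.img", O_RDWR);
	int dst = open("/mnt/b.img", O_RDWR | O_CREAT, 0644);

	if (src < 0 || dst < 0 || ioctl(dst, FICLONE, src) < 0) {
		perror("setup");
		return 1;
	}

	/* Map the same (shared) block through both files. */
	char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, src, 0);
	char *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, dst, 0);
	if (a == MAP_FAILED || b == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Write faults through both links: each should trigger CoW, but
	 * with a 1:1 page->mapping/page->index the second association is
	 * where things should blow up. */
	a[0] = 'A';
	b[0] = 'B';
	msync(a, 4096, MS_SYNC);
	msync(b, 4096, MS_SYNC);
	return 0;
}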
On Tue, Oct 08, 2019 at 11:31:44PM -0700, Christoph Hellwig wrote:
> Btw, I just had a chat with Dan last week on this. And he pointed out
> that while this series deals with the read/write path issues of
> reflink on DAX it doesn't deal with the mmap side issue that
> page->mapping and page->index can point back to exactly one file.
>
> I think we want a few xfstests that reflink a file and then use the
> different links using mmap, as that should blow up pretty reliably.
Hmm, you're right, we don't actually have a test that checks the
behavior of mwriting all copies of a shared block. Ok, I'll go write
one.
--D
On Wed, Oct 09, 2019 at 10:11:52AM -0700, Darrick J. Wong wrote:
> On Tue, Oct 08, 2019 at 11:31:44PM -0700, Christoph Hellwig wrote:
> > Btw, I just had a chat with Dan last week on this. And he pointed out
> > that while this series deals with the read/write path issues of
> > reflink on DAX it doesn't deal with the mmap side issue that
> > page->mapping and page->index can point back to exactly one file.
> >
> > I think we want a few xfstests that reflink a file and then use the
> > different links using mmap, as that should blow up pretty reliably.
>
> Hmm, you're right, we don't actually have a test that checks the
> behavior of mwriting all copies of a shared block. Ok, I'll go write
> one.
I've pointed this problem out to everyone who has asked me "what do
we need to do to support reflink on DAX". I've even walked a couple
of people right through the problem that needs to be solved and
discussed the potential solutions to it.
Problems that I think need addressing:
- device dax and filesystem dax have fundamentally different
needs in this space, so they need to be separated and not
try to use the same solution.
- dax_lock_entry() is being used as a substitute for
lock_page(), but it is not held on the page itself, which means
it can't be extended to serialise access to the page
across multiple mappings that are unaware of each other.
- the dax_lock_page/dax_unlock_page interface for hardware
memory errors needs to report errors to the
filesystem for processing and repair, not assume that the page
is user data and that killing processes is the only possible
recovery mechanism.
- dax_associate_entry/dax_disassociate_entry can only work
for a 1:1 page:mapping,index relationship. It needs to go
away and be replaced by a mechanism that allows
tracking multiple page mapping/index/state tuples. This
has much wider use than DAX (e.g. sharing page cache pages
between reflinked files).
I've proposed shadow pages (based on a concept from Matthew Wilcox)
for each read-only reflink mapping, with the real physical page being
owned by the filesystem and indexed by LBA in the filesystem buffer
cache. This would be based on whether the extent in the file the
page is mapped from has multiple references to it.
i.e. when a new page mapping occurs in a shared extent, we add the
page to the buffer cache (i.e. point a struct xfs_buf at it) if it
isn't already present, then allocate a shadow page, point it at the
master, set it up with the new mapping,index tuple and add it to the
mapping tree. Then we can treat it as a unique page even though it
points to the read-only master page.
When the page gets COWed, we toss away the shadow page, and the
master can be reclaimed when the reference count goes to zero or the
extent is no longer shared. Think of it kind of like the way we
multiply reference the zero page for holes in mmap()d dax regions,
except we can have millions of them and they are found by physical
buffer cache index lookups.
This works for both DAX and non-DAX sharing of read-only shared
filesystem pages, i.e. it would form the basis of single-copy
read-only page cache pages for reflinked files.
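To make the shape of that concrete, a rough sketch of the bookkeeping
such a scheme implies is below. All names and fields are hypothetical;
nothing like this exists in the tree, and the master-page lookup via the
buffer cache is only described, not implemented:

/* Hypothetical sketch of the shadow-page bookkeeping described above.
 * It only illustrates the 1:N master:shadow relationship, not a
 * working implementation.
 */
struct shadow_page {
	struct page		*master;	/* fs-owned page, found by LBA
						   lookup in the buffer cache
						   (e.g. via a struct xfs_buf) */
	struct address_space	*mapping;	/* the reflinked file's mapping */
	pgoff_t			index;		/* offset within that file */
	struct list_head	link;		/* all shadows of this master */
};

/* On a new mapping of a shared extent: find-or-create the master in
 * the buffer cache, allocate a shadow_page carrying the new
 * (mapping, index) tuple, and insert it into that file's mapping
 * tree. On CoW, unlink and free the shadow; the master becomes
 * reclaimable once no shadows remain and the extent is unshared.
 */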
There was quite a bit of talk at LSFMM 2018 about having a linked
list of mapping structures hanging off a struct page, one for each
mapping that references the page. Operations would then have to walk
all mappings that reference the page. This was useful to other
subsystems (HMM?) for some purpose I forget, but I'm not sure it's
particularly useful by itself for non-dax reflink purposes - I
suspect the filesystem would still need to track such pages itself
in its buffer cache so it can find the cached page to link new
reflink copies to the same page...
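For comparison with the shadow-page approach, that LSFMM idea reduces
to something like the following. Again, the names are purely made up:

/* Hypothetical sketch of the LSFMM "list of mappings per page" idea. */
struct page_mapping_link {
	struct address_space	*mapping;
	pgoff_t			index;
	struct list_head	node;	/* chained off the struct page */
};

/* Reverse-mapping and writeback operations would walk the chain and
 * act on every (mapping, index) pair instead of the single
 * page->mapping/page->index that exists today.
 */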
ISTR a couple of other solutions were thrown around, but I don't
think anyone came up with a simple solution...
Cheers,
Dave.
--
Dave Chinner
[email protected]