Hi!
This is another part of the series that implements new DAX page fault locking.
This part contains easy DAX cleanups and fixes that were already reviewed and
are rather obvious so they could be merged right away. They mostly remove
dead unused code. The patches didn't change since the last posting of the DAX
page fault locking series except for some added Reviewed-by tags.
Since DAX error handling patches depend on these I assume they will get merged
through NVDIMM tree. These patches depend on ext4 fixes from the series "ext4:
DAX fixes". The dependency is mostly functional (these patches remove zeroing
from DAX code and ext4 was depending on this zeroing until the above series
fixed that) so NVDIMM tree should be merged after ext4 tree to avoid breaking
ext4 DAX support.
Honza
All the filesystems are now zeroing blocks themselves for DAX IO to avoid
races between dax_io() and dax_fault(). Remove the zeroing code from
dax_io() and add warning to catch the case when somebody unexpectedly
returns new or unwritten buffer.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 28 ++++++++++------------------
1 file changed, 10 insertions(+), 18 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index ccb8bc399d78..7c0036dd1570 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -119,18 +119,6 @@ int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
}
EXPORT_SYMBOL_GPL(dax_clear_sectors);
-/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
-static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
- loff_t pos, loff_t end)
-{
- loff_t final = end - pos + first; /* The final byte of the buffer */
-
- if (first > 0)
- clear_pmem(addr, first);
- if (final < size)
- clear_pmem(addr + final, size - final);
-}
-
static bool buffer_written(struct buffer_head *bh)
{
return buffer_mapped(bh) && !buffer_unwritten(bh);
@@ -169,6 +157,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
struct blk_dax_ctl dax = {
.addr = (void __pmem *) ERR_PTR(-EIO),
};
+ unsigned blkbits = inode->i_blkbits;
+ sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
+ >> blkbits;
if (rw == READ)
end = min(end, i_size_read(inode));
@@ -176,7 +167,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
while (pos < end) {
size_t len;
if (pos == max) {
- unsigned blkbits = inode->i_blkbits;
long page = pos >> PAGE_SHIFT;
sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
@@ -192,6 +182,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
bdev = bh->b_bdev;
+ /*
+ * We allow uninitialized buffers for writes
+ * beyond EOF as those cannot race with faults
+ */
+ WARN_ON_ONCE(
+ (buffer_new(bh) && block < file_blks) ||
+ (rw == WRITE && buffer_unwritten(bh)));
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -211,11 +208,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
rc = map_len;
break;
}
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(dax.addr, map_len, first,
- pos, end);
- need_wmb = true;
- }
dax.addr += first;
size = map_len - first;
}
--
2.6.6
From: NeilBrown <[email protected]>
These don't belong in radix-tree.c any more than PAGECACHE_TAG_* do.
Let's try to maintain the idea that radix-tree simply implements an
abstract data type.
Acked-by: Ross Zwisler <[email protected]>
Reviewed-by: Matthew Wilcox <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 9 +++++++++
include/linux/radix-tree.h | 9 ---------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 75ba46d82a76..08799a510b4d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -32,6 +32,15 @@
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#define RADIX_DAX_MASK 0xf
+#define RADIX_DAX_SHIFT 4
+#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+ RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 51a97ac8bfbf..d08d6ec3bf53 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -52,15 +52,6 @@
#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
-#define RADIX_DAX_MASK 0xf
-#define RADIX_DAX_SHIFT 4
-#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
- RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
dax_do_io() is calling filemap_write_and_wait() if DIO_LOCKING flags is
set. Presumably this was copied over from direct IO code. However DAX
inodes have no pagecache pages to write so the call is pointless. Remove
it.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 10 +---------
1 file changed, 1 insertion(+), 9 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 7c0036dd1570..237581441bc1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -268,15 +268,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
memset(&bh, 0, sizeof(bh));
bh.b_bdev = inode->i_sb->s_bdev;
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
- struct address_space *mapping = inode->i_mapping;
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_lock(inode);
- retval = filemap_write_and_wait_range(mapping, pos, end - 1);
- if (retval) {
- inode_unlock(inode);
- goto out;
- }
- }
/* Protects against truncate */
if (!(flags & DIO_SKIP_DIO_COUNT))
@@ -297,7 +290,6 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
- out:
return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
--
2.6.6
Callers of dax fault handlers must make sure these calls cannot race
with truncate. Thus it is enough to check inode size when entering the
function and we don't have to recheck it again later in the handler.
Note that inode size itself can be decreased while the fault handler
runs but filesystem locking prevents against any radix tree or block
mapping information changes resulting from the truncate and that is what
we really care about.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 60 +-----------------------------------------------------------
1 file changed, 1 insertion(+), 59 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 237581441bc1..9bc6624251b4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -305,20 +305,11 @@ EXPORT_SYMBOL_GPL(dax_do_io);
static int dax_load_hole(struct address_space *mapping, struct page *page,
struct vm_fault *vmf)
{
- unsigned long size;
- struct inode *inode = mapping->host;
if (!page)
page = find_or_create_page(mapping, vmf->pgoff,
GFP_KERNEL | __GFP_ZERO);
if (!page)
return VM_FAULT_OOM;
- /* Recheck i_size under page lock to avoid truncate race */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- unlock_page(page);
- put_page(page);
- return VM_FAULT_SIGBUS;
- }
vmf->page = page;
return VM_FAULT_LOCKED;
@@ -549,24 +540,10 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
.sector = to_sector(bh, inode),
.size = bh->b_size,
};
- pgoff_t size;
int error;
i_mmap_lock_read(mapping);
- /*
- * Check truncate didn't happen while we were allocating a block.
- * If it did, this block may or may not be still allocated to the
- * file. We can't tell the filesystem to free it because we can't
- * take i_mutex here. In the worst case, the file still has blocks
- * allocated past the end of the file.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- error = -EIO;
- goto out;
- }
-
if (dax_map_atomic(bdev, &dax) < 0) {
error = PTR_ERR(dax.addr);
goto out;
@@ -632,15 +609,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
put_page(page);
goto repeat;
}
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
- /*
- * We have a struct page covering a hole in the file
- * from a read fault and we've raced with a truncate
- */
- error = -EIO;
- goto unlock_page;
- }
}
error = get_block(inode, block, &bh, 0);
@@ -673,17 +641,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (error)
goto unlock_page;
vmf->page = page;
- if (!page) {
+ if (!page)
i_mmap_lock_read(mapping);
- /* Check we didn't race with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
- error = -EIO;
- goto out;
- }
- }
return VM_FAULT_LOCKED;
}
@@ -861,23 +820,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
i_mmap_lock_read(mapping);
- /*
- * If a truncate happened while we were allocating blocks, we may
- * leave blocks allocated to the file that are beyond EOF. We can't
- * take i_mutex here, so just leave them hanging; they'll be freed
- * when the file is deleted.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size) {
- result = VM_FAULT_SIGBUS;
- goto out;
- }
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(&bh, address,
- "offset + huge page size > file size");
- goto fallback;
- }
Now that all filesystems zero out blocks allocated for a fault handler,
we can just remove the zeroing from the handler itself. Also add checks
that no filesystem returns to us unwritten or new buffer.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index c5ccf745d279..ccb8bc399d78 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -587,11 +587,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
error = PTR_ERR(dax.addr);
goto out;
}
-
- if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(dax.addr, PAGE_SIZE);
- wmb_pmem();
- }
dax_unmap_atomic(bdev, &dax);
error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
@@ -670,7 +665,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (error)
goto unlock_page;
- if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+ if (!buffer_mapped(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
error = get_block(inode, block, &bh, 1);
count_vm_event(PGMAJFAULT);
@@ -722,7 +717,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
/* Filesystem should not return unwritten buffers to us! */
- WARN_ON_ONCE(buffer_unwritten(&bh));
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
error = dax_insert_mapping(inode, &bh, vma, vmf);
out:
@@ -854,7 +849,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (get_block(inode, block, &bh, 1) != 0)
return VM_FAULT_SIGBUS;
alloc = true;
- WARN_ON_ONCE(buffer_unwritten(&bh));
+ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
}
bdev = bh.b_bdev;
@@ -953,14 +948,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
dax_pmd_dbg(&bh, address, "pfn not in memmap");
goto fallback;
}
Currently ext2 zeroes any data blocks allocated for DAX inode however it
still returns them as BH_New. Thus DAX code zeroes them again in
dax_insert_mapping() which can possibly overwrite the data that has been
already stored to those blocks by a racing dax_io(). Avoid marking
pre-zeroed buffers as new.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/ext2/inode.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6bd58e6ff038..1f07b758b968 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- }
+ } else
+ set_buffer_new(bh_result);
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
- set_buffer_new(bh_result);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
--
2.6.6
Fault handlers currently take complete_unwritten argument to convert
unwritten extents after PTEs are updated. However no filesystem uses
this anymore as the code is racy. Remove the unused argument.
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/block_dev.c | 4 ++--
fs/dax.c | 43 +++++++++----------------------------------
fs/ext2/file.c | 4 ++--
fs/ext4/file.c | 4 ++--
fs/xfs/xfs_file.c | 7 +++----
include/linux/dax.h | 17 +++++++----------
include/linux/fs.h | 1 -
7 files changed, 25 insertions(+), 55 deletions(-)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..b25bb230b28a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1746,7 +1746,7 @@ static const struct address_space_operations def_blk_aops = {
*/
static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+ return __dax_fault(vma, vmf, blkdev_get_block);
}
static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
@@ -1758,7 +1758,7 @@ static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
diff --git a/fs/dax.c b/fs/dax.c
index 08799a510b4d..c5ccf745d279 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -612,19 +612,13 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @get_block: The filesystem method used to translate file offsets to blocks
- * @complete_unwritten: The filesystem method used to convert unwritten blocks
- * to written so the data written to them is exposed. This is required for
- * required by write faults for filesystems that will return unwritten
- * extent mappings from @get_block, but it is optional for reads as
- * dax_insert_mapping() will always zero unwritten blocks. If the fs does
- * not support unwritten extents, the it should pass NULL.
*
* When a page fault occurs, filesystems may call this helper in their
* fault handler for DAX files. __dax_fault() assumes the caller has done all
* the necessary locking for the page fault to proceed successfully.
*/
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -727,23 +721,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page = NULL;
}
- /*
- * If we successfully insert the new mapping over an unwritten extent,
- * we need to ensure we convert the unwritten extent. If there is an
- * error inserting the mapping, the filesystem needs to leave it as
- * unwritten to prevent exposure of the stale underlying data to
- * userspace, but we still need to call the completion function so
- * the private resources on the mapping buffer can be released. We
- * indicate what the callback should do via the uptodate variable, same
- * as for normal BH based IO completions.
- */
+ /* Filesystem should not return unwritten buffers to us! */
+ WARN_ON_ONCE(buffer_unwritten(&bh));
error = dax_insert_mapping(inode, &bh, vma, vmf);
- if (buffer_unwritten(&bh)) {
- if (complete_unwritten)
- complete_unwritten(&bh, !error);
- else
- WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
- }
out:
if (error == -ENOMEM)
@@ -772,7 +752,7 @@ EXPORT_SYMBOL(__dax_fault);
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block, dax_iodone_t complete_unwritten)
+ get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -781,7 +761,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_fault(vma, vmf, get_block, complete_unwritten);
+ result = __dax_fault(vma, vmf, get_block);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -815,8 +795,7 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -875,6 +854,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (get_block(inode, block, &bh, 1) != 0)
return VM_FAULT_SIGBUS;
alloc = true;
+ WARN_ON_ONCE(buffer_unwritten(&bh));
}
bdev = bh.b_bdev;
@@ -1020,9 +1000,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
out:
i_mmap_unlock_read(mapping);
- if (buffer_unwritten(&bh))
- complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
-
return result;
fallback:
@@ -1042,8 +1019,7 @@ EXPORT_SYMBOL_GPL(__dax_pmd_fault);
* pmd_fault handler for DAX files.
*/
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block,
- dax_iodone_t complete_unwritten)
+ pmd_t *pmd, unsigned int flags, get_block_t get_block)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -1052,8 +1028,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
- complete_unwritten);
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
if (flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c1400b109805..868c02317b05 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+ ret = __dax_fault(vma, vmf, ext2_get_block);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
}
down_read(&ei->dax_sem);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
up_read(&ei->dax_sem);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index dfb33da04589..cdcab0b36faa 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
+ result = __dax_fault(vma, vmf, ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
@@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
result = VM_FAULT_SIGBUS;
else
result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_get_block, NULL);
+ ext4_dax_get_block);
if (write) {
if (!IS_ERR(handle))
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 569938a4a357..c2946f436a3a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1558,7 +1558,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
} else {
ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
@@ -1592,7 +1592,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1629,8 +1629,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
- NULL);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 636dd59ab505..7c45ac7ea1d1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -10,10 +10,8 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
#ifdef CONFIG_FS_DAX
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
@@ -27,21 +25,20 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
- unsigned int flags, get_block_t, dax_iodone_t);
+ unsigned int flags, get_block_t);
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
- unsigned int flags, get_block_t, dax_iodone_t);
+ unsigned int flags, get_block_t);
#else
static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags, get_block_t gb,
- dax_iodone_t di)
+ pmd_t *pmd, unsigned int flags, get_block_t gb)
{
return VM_FAULT_FALLBACK;
}
#define __dax_pmd_fault dax_pmd_fault
#endif
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
+#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb)
static inline bool vma_is_dax(struct vm_area_struct *vma)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..9f2813090d1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,7 +74,6 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
-typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
--
2.6.6
On Wed, May 11, 2016 at 11:58:49AM +0200, Jan Kara wrote:
> Currently ext2 zeroes any data blocks allocated for DAX inode however it
> still returns them as BH_New. Thus DAX code zeroes them again in
> dax_insert_mapping() which can possibly overwrite the data that has been
> already stored to those blocks by a racing dax_io(). Avoid marking
> pre-zeroed buffers as new.
>
> Reviewed-by: Ross Zwisler <[email protected]>
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext2/inode.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 6bd58e6ff038..1f07b758b968 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode *inode,
> mutex_unlock(&ei->truncate_mutex);
> goto cleanup;
> }
> - }
> + } else
> + set_buffer_new(bh_result);
>
> ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
> mutex_unlock(&ei->truncate_mutex);
> - set_buffer_new(bh_result);
> got_it:
> map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
> if (count > blocks_to_boundary)
> --
> 2.6.6
Interestingly this change is causing a bunch of xfstests regressions for me
with ext2 + DAX. All of these tests pass without this one change.
# ./check generic/075 generic/091 generic/112 generic/127 generic/231 generic/263
FSTYP -- ext2
PLATFORM -- Linux/x86_64 alara 4.6.0-rc5+
MKFS_OPTIONS -- /dev/pmem0p2
MOUNT_OPTIONS -- -o dax -o context=system_u:object_r:nfs_t:s0 /dev/pmem0p2 /mnt/xfstests_scratch
generic/075 2s ... [failed, exit status 1] - output mismatch (see /root/xfstests/results//generic/075.out.bad)
--- tests/generic/075.out 2015-10-02 10:19:36.798795839 -0600
+++ /root/xfstests/results//generic/075.out.bad 2016-05-12 12:40:06.558706982 -0600
@@ -4,15 +4,5 @@
-----------------------------------------------
fsx.0 : -d -N numops -S 0
-----------------------------------------------
-
------------------------------------------------
-fsx.1 : -d -N numops -S 0 -x
------------------------------------------------
...
(Run 'diff -u tests/generic/075.out /root/xfstests/results//generic/075.out.bad' to see the entire diff)
generic/091 1s ... [failed, exit status 1] - output mismatch (see /root/xfstests/results//generic/091.out.bad)
--- tests/generic/091.out 2015-10-02 10:19:36.800795853 -0600
+++ /root/xfstests/results//generic/091.out.bad 2016-05-12 12:40:07.366712217 -0600
@@ -1,7 +1,110 @@
QA output created by 091
fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
-fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
-fsx -N 10000 -o 32768 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
-fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
-fsx -N 10000 -o 32768 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W
-fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -W
...
(Run 'diff -u tests/generic/091.out /root/xfstests/results//generic/091.out.bad' to see the entire diff)
generic/112 2s ... [failed, exit status 1] - output mismatch (see /root/xfstests/results//generic/112.out.bad)
--- tests/generic/112.out 2015-10-02 10:19:36.802795866 -0600
+++ /root/xfstests/results//generic/112.out.bad 2016-05-12 12:40:08.601720218 -0600
@@ -4,15 +4,5 @@
-----------------------------------------------
fsx.0 : -A -d -N numops -S 0
-----------------------------------------------
-
------------------------------------------------
-fsx.1 : -A -d -N numops -S 0 -x
------------------------------------------------
...
(Run 'diff -u tests/generic/112.out /root/xfstests/results//generic/112.out.bad' to see the entire diff)
generic/127 14s ... - output mismatch (see /root/xfstests/results//generic/127.out.bad)
--- tests/generic/127.out 2016-02-17 16:02:40.110656181 -0700
+++ /root/xfstests/results//generic/127.out.bad 2016-05-12 12:40:16.861773730 -0600
@@ -4,10 +4,905 @@
=== FSX Light Mode, Memory Mapping ===
All 100000 operations completed A-OK!
=== FSX Standard Mode, No Memory Mapping ===
-All 100000 operations completed A-OK!
+ltp/fsx -q -l 262144 -o 65536 -S 191110531 -N 100000 -R -W fsx_std_nommap
+READ BAD DATA: offset = 0xd72d, size = 0x826b, fname = /mnt/xfstests_test/fsx_std_nommap
+OFFSET GOOD BAD RANGE
...
(Run 'diff -u tests/generic/127.out /root/xfstests/results//generic/127.out.bad' to see the entire diff)
generic/231 22s ... [failed, exit status 1] - output mismatch (see /root/xfstests/results//generic/231.out.bad)
--- tests/generic/231.out 2016-02-17 16:02:40.120656239 -0700
+++ /root/xfstests/results//generic/231.out.bad 2016-05-12 12:40:19.266789311 -0600
@@ -1,16 +1,1802 @@
QA output created by 231
=== FSX Standard Mode, Memory Mapping, 1 Tasks ===
-All 20000 operations completed A-OK!
-Comparing user usage
-Comparing group usage
-=== FSX Standard Mode, Memory Mapping, 4 Tasks ===
-All 20000 operations completed A-OK!
...
(Run 'diff -u tests/generic/231.out /root/xfstests/results//generic/231.out.bad' to see the entire diff)
generic/263 1s ... [failed, exit status 1] - output mismatch (see /root/xfstests/results//generic/263.out.bad)
--- tests/generic/263.out 2015-10-02 10:19:36.807795900 -0600
+++ /root/xfstests/results//generic/263.out.bad 2016-05-12 12:40:19.801792777 -0600
@@ -1,3 +1,1429 @@
QA output created by 263
fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z
-fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z
+main: filesystem does not support fallocate mode 0, disabling!
+main: filesystem does not support fallocate mode FALLOC_FL_KEEP_SIZE, disabling!
+main: filesystem does not support fallocate mode FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, disabling!
+main: filesystem does not support fallocate mode FALLOC_FL_ZERO_RANGE, disabling!
...
(Run 'diff -u tests/generic/263.out /root/xfstests/results//generic/263.out.bad' to see the entire diff)
Ran: generic/075 generic/091 generic/112 generic/127 generic/231 generic/263
Failures: generic/075 generic/091 generic/112 generic/127 generic/231 generic/263
Failed 6 of 6 tests
On Thu 12-05-16 12:45:22, Ross Zwisler wrote:
> On Wed, May 11, 2016 at 11:58:49AM +0200, Jan Kara wrote:
> > Currently ext2 zeroes any data blocks allocated for DAX inode however it
> > still returns them as BH_New. Thus DAX code zeroes them again in
> > dax_insert_mapping() which can possibly overwrite the data that has been
> > already stored to those blocks by a racing dax_io(). Avoid marking
> > pre-zeroed buffers as new.
> >
> > Reviewed-by: Ross Zwisler <[email protected]>
> > Signed-off-by: Jan Kara <[email protected]>
> > ---
> > fs/ext2/inode.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> > index 6bd58e6ff038..1f07b758b968 100644
> > --- a/fs/ext2/inode.c
> > +++ b/fs/ext2/inode.c
> > @@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode *inode,
> > mutex_unlock(&ei->truncate_mutex);
> > goto cleanup;
> > }
> > - }
> > + } else
> > + set_buffer_new(bh_result);
> >
> > ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
> > mutex_unlock(&ei->truncate_mutex);
> > - set_buffer_new(bh_result);
> > got_it:
> > map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
> > if (count > blocks_to_boundary)
> > --
> > 2.6.6
>
> Interestingly this change is causing a bunch of xfstests regressions for me
> with ext2 + DAX. All of these tests pass without this one change.
Good catch. Attached patch fixes this issue for me. Preferably it should be
merged before the above ext2 change.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, 2016-05-16 at 17:22 +0200, Jan Kara wrote:
> On Thu 12-05-16 12:45:22, Ross Zwisler wrote:
> >
> > On Wed, May 11, 2016 at 11:58:49AM +0200, Jan Kara wrote:
> > >
> > > Currently ext2 zeroes any data blocks allocated for DAX inode
> > > however it
> > > still returns them as BH_New. Thus DAX code zeroes them again in
> > > dax_insert_mapping() which can possibly overwrite the data that
> > > has been
> > > already stored to those blocks by a racing dax_io(). Avoid
> > > marking
> > > pre-zeroed buffers as new.
> > >
> > > Reviewed-by: Ross Zwisler <[email protected]>
> > > Signed-off-by: Jan Kara <[email protected]>
> > > ---
> > > fs/ext2/inode.c | 4 ++--
> > > 1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> > > index 6bd58e6ff038..1f07b758b968 100644
> > > --- a/fs/ext2/inode.c
> > > +++ b/fs/ext2/inode.c
> > > @@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode
> > > *inode,
> > > mutex_unlock(&ei->truncate_mutex);
> > > goto cleanup;
> > > }
> > > - }
> > > + } else
> > > + set_buffer_new(bh_result);
> > >
> > > ext2_splice_branch(inode, iblock, partial,
> > > indirect_blks, count);
> > > mutex_unlock(&ei->truncate_mutex);
> > > - set_buffer_new(bh_result);
> > > got_it:
> > > map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-
> > > 1].key));
> > > if (count > blocks_to_boundary)
> > > --
> > > 2.6.6
> > Interestingly this change is causing a bunch of xfstests
> > regressions for me
> > with ext2 + DAX. All of these tests pass without this one change.
> Good catch. Attached patch fixes this issue for me. Preferably it
> should be
> merged before the above ext2 change.
>
> Honza
Hey Jan,
In my patch 3 of the error handling series, I have:
- err = dax_clear_sectors(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) <<
- (inode->i_blkbits - 9),
- 1 << inode->i_blkbits);
+ err = sb_issue_zeroout(inode->i_sb,
+ le32_to_cpu(chain[depth-1].key), 1, GFP_NOFS);
Does this mean I have to change to send the sb_issue_zeroout for
'count' blocks.. i.e.
- err = dax_clear_sectors(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) <<
- (inode->i_blkbits - 9),
- 1 << inode->i_blkbits);
+ err = sb_issue_zeroout(inode->i_sb,
+ le32_to_cpu(chain[depth-1].key), count, GFP_NOFS);
If so, I'll update my series tomorrow to include in both of these changes.
Thanks,
-Vishal
On Tue 17-05-16 06:52:47, Verma, Vishal L wrote:
> On Mon, 2016-05-16 at 17:22 +0200, Jan Kara wrote:
> > On Thu 12-05-16 12:45:22, Ross Zwisler wrote:
> > >
> > > On Wed, May 11, 2016 at 11:58:49AM +0200, Jan Kara wrote:
> > > >
> > > > Currently ext2 zeroes any data blocks allocated for DAX inode
> > > > however it
> > > > still returns them as BH_New. Thus DAX code zeroes them again in
> > > > dax_insert_mapping() which can possibly overwrite the data that
> > > > has been
> > > > already stored to those blocks by a racing dax_io(). Avoid
> > > > marking
> > > > pre-zeroed buffers as new.
> > > >
> > > > Reviewed-by: Ross Zwisler <[email protected]>
> > > > Signed-off-by: Jan Kara <[email protected]>
> > > > ---
> > > > ?fs/ext2/inode.c | 4 ++--
> > > > ?1 file changed, 2 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> > > > index 6bd58e6ff038..1f07b758b968 100644
> > > > --- a/fs/ext2/inode.c
> > > > +++ b/fs/ext2/inode.c
> > > > @@ -745,11 +745,11 @@ static int ext2_get_blocks(struct inode
> > > > *inode,
> > > > ? mutex_unlock(&ei->truncate_mutex);
> > > > ? goto cleanup;
> > > > ? }
> > > > - }
> > > > + } else
> > > > + set_buffer_new(bh_result);
> > > > ?
> > > > ? ext2_splice_branch(inode, iblock, partial,
> > > > indirect_blks, count);
> > > > ? mutex_unlock(&ei->truncate_mutex);
> > > > - set_buffer_new(bh_result);
> > > > ?got_it:
> > > > ? map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-
> > > > 1].key));
> > > > ? if (count > blocks_to_boundary)
> > > > --?
> > > > 2.6.6
> > > Interestingly this change is causing a bunch of xfstests
> > > regressions for me
> > > with ext2 + DAX.??All of these tests pass without this one change.
> > Good catch. Attached patch fixes this issue for me. Preferably it
> > should be
> > merged before the above ext2 change.
> >
> > Honza
>
> Hey Jan,
>
> In my patch 3 of the error handling series, I have:
>
> -???????????????err = dax_clear_sectors(inode->i_sb->s_bdev,
> -???????????????????????????????le32_to_cpu(chain[depth-1].key) <<
> -???????????????????????????????(inode->i_blkbits - 9),
> -???????????????????????????????1 << inode->i_blkbits);
> +???????????????err = sb_issue_zeroout(inode->i_sb,
> +???????????????????????????????le32_to_cpu(chain[depth-1].key), 1, GFP_NOFS);
>
> Does this mean I have to change to send the sb_issue_zeroout for
> 'count' blocks.. i.e.
Yes, I've noticed the conflict today as well.
> -???????????????err = dax_clear_sectors(inode->i_sb->s_bdev,
> -???????????????????????????????le32_to_cpu(chain[depth-1].key) <<
> -???????????????????????????????(inode->i_blkbits - 9),
> -???????????????????????????????1 << inode->i_blkbits);
> +???????????????err = sb_issue_zeroout(inode->i_sb,
> +???????????????????????????????le32_to_cpu(chain[depth-1].key), count, GFP_NOFS);
>
> If so, I'll update my series tomorrow to include in both of these changes.
I'd prefer these two to stay separate commits (they are really
independent). Since you already depend on other patches from the DAX
cleanup series, just add this patch to the list of dependencies and base
your change on that... Hmm?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, 2016-05-17 at 09:19 +0200, Jan Kara wrote:
> On Tue 17-05-16 06:52:47, Verma, Vishal L wrote:
[...]
> >
> >
> > - err = dax_clear_sectors(inode->i_sb->s_bdev,
> > - le32_to_cpu(chain[depth-1].key) <<
> > - (inode->i_blkbits - 9),
> > - 1 << inode->i_blkbits);
> > + err = sb_issue_zeroout(inode->i_sb,
> > + le32_to_cpu(chain[depth-1].key),
> > count, GFP_NOFS);
> >
> > If so, I'll update my series tomorrow to include in both of these
> > changes.
> I'd prefer these two to stay separate commits (they are really
> independent). Since you already depend on other patches from the DAX
> cleanup series, just add this patch to the list of dependencies and
> base
> your change on that... Hmm?
Yes I agree. I'm preparing a branch that adds your fix, and modifying my
sb_issue_zeroout to use the same fix. I've verified that those tests
pass after both changes.
Thanks!
-Vishal