2017-05-10 08:54:19

by Jan Kara

Subject: [PATCH 4/4] dax: Fix data corruption when fault races with write

Currently DAX read fault can race with write(2) in the following way:

CPU1 - write(2)                          CPU2 - read fault
                                         dax_iomap_pte_fault()
                                           ->iomap_begin() - sees hole
dax_iomap_rw()
  iomap_apply()
    ->iomap_begin - allocates blocks
    dax_iomap_actor()
      invalidate_inode_pages2_range()
        - there's nothing to invalidate
                                         grab_mapping_entry()
                                           - we add zero page in the radix tree
                                             and map it to page tables

The result is that the hole page is mapped into the page tables (and thus
zeros are seen in mmap) while the file has data written in that place.
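
To make the symptom concrete, here is a minimal userspace reproducer
sketch (illustrative only, not part of the patch). The path
/mnt/dax/testfile is a stand-in for a file on a DAX-mounted filesystem,
and since the race window is narrow a practical reproducer would loop
many times:

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define FILESZ 4096

static char *map;

/* Touch the mapping: this takes a DAX read fault. If it races with
 * the pwrite() below, the buggy kernel can leave the zero page
 * mapped over freshly written data. */
static void *reader(void *arg)
{
	volatile char c = map[0];
	(void)arg;
	(void)c;
	return NULL;
}

int main(void)
{
	char buf[FILESZ];
	pthread_t t;
	int fd;

	fd = open("/mnt/dax/testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0 || ftruncate(fd, FILESZ))
		return 1;
	map = mmap(NULL, FILESZ, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	pthread_create(&t, NULL, reader, NULL);

	/* racing write(2) into what is still a hole */
	memset(buf, 0xaa, sizeof(buf));
	pwrite(fd, buf, sizeof(buf), 0);
	pthread_join(t, NULL);

	/* with the bug, the mapping can keep showing zeros even though
	 * the file now contains 0xaa bytes */
	if (map[0] != (char)0xaa)
		fprintf(stderr, "stale zero page visible via mmap\n");

	munmap(map, FILESZ);
	close(fd);
	return 0;
}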

Fix the problem by locking the exception entry before mapping blocks for
the fault. That way we are sure the invalidate_inode_pages2_range() call
for a racing write will either block on the entry lock, waiting for the
fault to finish (and unmap the stale page tables after that), or the read
fault will see the blocks already allocated by write(2).
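
The either/or above can be modeled in a few lines of userspace C (a
sketch only: the pthread mutex stands in for the radix tree entry lock
and the stub iomap_begin() for the filesystem block lookup):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER;
static int blocks_allocated;		/* models the on-disk block state */

/* models ->iomap_begin(): do blocks back this offset yet? */
static int iomap_begin(void)
{
	return blocks_allocated;
}

static void *read_fault(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&entry_lock);	/* grab_mapping_entry() first */
	printf("fault maps %s\n",
	       iomap_begin() ? "data" : "hole");	/* block lookup second */
	pthread_mutex_unlock(&entry_lock);	/* put_locked_mapping_entry() */
	return NULL;
}

static void *write_path(void *arg)
{
	(void)arg;
	blocks_allocated = 1;			/* ->iomap_begin allocates */
	pthread_mutex_lock(&entry_lock);	/* invalidate_inode_pages2_range()
						   waits on any locked entry */
	/* a stale hole mapping would be unmapped here */
	pthread_mutex_unlock(&entry_lock);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, read_fault, NULL);
	pthread_create(&w, NULL, write_path, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}

Whichever side takes the entry lock first, a stale hole mapping cannot
survive the write.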

Fixes: 9f141d6ef6258a3a37a045842d9ba7e68f368956
CC: [email protected]
Reviewed-by: Ross Zwisler <[email protected]>
Signed-off-by: Jan Kara <[email protected]>
---
fs/dax.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 123d9903c77d..32f020c9cedf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1148,6 +1148,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;

+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto out;
+ }
+
/*
* Note that we don't bother to use iomap_apply here: DAX required
* the file system block size to be equal the page size, which means
@@ -1156,17 +1162,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) {
vmf_ret = dax_fault_return(error);
- goto out;
+ goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
- goto finish_iomap;
- }
-
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
- goto finish_iomap;
+ error = -EIO; /* fs corruption? */
+ goto error_finish_iomap;
}

sector = dax_iomap_sector(&iomap, pos);
@@ -1188,13 +1188,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}

if (error)
- goto error_unlock_entry;
+ goto error_finish_iomap;

__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
- goto unlock_entry;
+ goto finish_iomap;
}

switch (iomap.type) {
@@ -1214,7 +1214,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
- goto unlock_entry;
+ goto finish_iomap;
}
/*FALLTHRU*/
default:
@@ -1223,10 +1223,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
break;
}

- error_unlock_entry:
+ error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1241,7 +1239,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
-out:
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
--
2.12.0


2017-05-10 17:27:00

by Ross Zwisler

Subject: [PATCH 5/4] dax: Fix PMD data corruption when fault races with write

This is based on a patch from Jan Kara that fixed the equivalent race in
the DAX PTE fault path.

Currently DAX PMD read fault can race with write(2) in the following way:

CPU1 - write(2)                          CPU2 - read fault
                                         dax_iomap_pmd_fault()
                                           ->iomap_begin() - sees hole

dax_iomap_rw()
  iomap_apply()
    ->iomap_begin - allocates blocks
    dax_iomap_actor()
      invalidate_inode_pages2_range()
        - there's nothing to invalidate

                                         grab_mapping_entry()
                                           - we add huge zero page to the radix tree
                                             and map it to page tables

The result is that the hole page is mapped into the page tables (and thus
zeros are seen in mmap) while the file has data written in that place.

Fix the problem by locking the exception entry before mapping blocks for
the fault. That way we are sure the invalidate_inode_pages2_range() call
for a racing write will either block on the entry lock, waiting for the
fault to finish (and unmap the stale page tables after that), or the read
fault will see the blocks already allocated by write(2).
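
Mechanically, this is the same label reordering as in the PTE patch: the
entry lock is taken before ->iomap_begin() and dropped only after
->iomap_end(), so the goto unwinding must release in reverse order. A
standalone sketch of the pattern (stub functions, not the kernel code):

#include <stdio.h>

static int grab_entry(void)	{ puts("grab_mapping_entry"); return 0; }
static int begin_io(void)	{ puts("iomap_begin"); return 0; }
static int insert_mapping(void)	{ puts("insert mapping"); return 0; }
static void end_io(void)	{ puts("iomap_end"); }
static void put_entry(void)	{ puts("put_locked_mapping_entry"); }

static int handle_fault(void)
{
	int ret = -1;

	if (grab_entry())
		goto out;
	if (begin_io())
		goto unlock_entry;	/* entry held, no iomap to end yet */

	ret = insert_mapping();

	end_io();			/* finish_iomap runs first... */
unlock_entry:
	put_entry();			/* ...the entry is released last */
out:
	return ret;
}

int main(void)
{
	return handle_fault();
}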

Signed-off-by: Ross Zwisler <[email protected]>
Fixes: 9f141d6ef6258a3a37a045842d9ba7e68f368956
CC: [email protected]
---

Jan, I just realized that we need an equivalent fix in the PMD path. Let's
keep this with the rest of your series so they get applied together,
applied to stable together, etc.

This applies cleanly to the current linux/master (56868a460b83) + the four
patches from Jan's series. I've run it through xfstests and some targeted
testing for the PMD path.

---
fs/dax.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 32f020c..93ae872 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1388,6 +1388,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;

/*
+ * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto fallback;
+
+ /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
@@ -1395,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
- goto fallback;
+ goto unlock_entry;

if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;

- /*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
- */
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
- goto finish_iomap;
-
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1417,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
- goto unlock_entry;
+ break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
@@ -1425,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
break;
}

- unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1442,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
--
2.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

2017-05-11 08:39:08

by Jan Kara

Subject: Re: [PATCH 5/4] dax: Fix PMD data corruption when fault races with write

On Wed 10-05-17 11:27:00, Ross Zwisler wrote:
> This is based on a patch from Jan Kara that fixed the equivalent race in
> the DAX PTE fault path.
>
> Currently DAX PMD read fault can race with write(2) in the following way:
>
> CPU1 - write(2)                          CPU2 - read fault
>                                          dax_iomap_pmd_fault()
>                                            ->iomap_begin() - sees hole
>
> dax_iomap_rw()
>   iomap_apply()
>     ->iomap_begin - allocates blocks
>     dax_iomap_actor()
>       invalidate_inode_pages2_range()
>         - there's nothing to invalidate
>
>                                          grab_mapping_entry()
>                                            - we add huge zero page to the radix tree
>                                              and map it to page tables
>
> The result is that the hole page is mapped into the page tables (and thus
> zeros are seen in mmap) while the file has data written in that place.
>
> Fix the problem by locking the exception entry before mapping blocks for
> the fault. That way we are sure the invalidate_inode_pages2_range() call
> for a racing write will either block on the entry lock, waiting for the
> fault to finish (and unmap the stale page tables after that), or the read
> fault will see the blocks already allocated by write(2).
>
> Signed-off-by: Ross Zwisler <[email protected]>
> Fixes: 9f141d6ef6258a3a37a045842d9ba7e68f368956
> CC: [email protected]
> ---
>
> Jan, I just realized that we need an equivalent fix in the PMD path. Let's
> keep this with the rest of your series so they get applied together,
> applied to stable together, etc.
>
> This applies cleanly to the current linux/master (56868a460b83) + the four
> patches from Jan's series. I've run it through xfstests and some targeted
> testing for the PMD path.

Ah, right. Thanks for fixing it up. The patch looks good. You can add:

Reviewed-by: Jan Kara <[email protected]>

Honza

--
Jan Kara <[email protected]>
SUSE Labs, CR