2013-08-28 12:22:03

by Maxim Patlasov

[permalink] [raw]
Subject: [PATCH] fuse: hotfix truncate_pagecache() issue

The way fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
release fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with the given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes mmap-ed
reads and writes, ftruncate(2) and write(2) extending file size.

The typical outcome of doing truncate_pagecache() with outdated arguments is
data corruption from the user's point of view. This is (in some sense) acceptable
in cases when the issue is triggered by a change of the file on the server
(i.e. externally wrt fuse operation), but it is absolutely intolerable in
scenarios when a single fuse client modifies a file without any external
intervention. A real life case I discovered by fsx-linux looked like this:

1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The latter
updates i_size, releases fc->lock, but before comparing oldsize vs attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to outarg.attr.size
because i_size has just been updated (step 2). Hence, fuse_do_setattr()
returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by write(2)
making a hole in the middle of file, then reads data from the hole either by
read(2) or mmap-ed read. The user expects to get zero data from the hole, but
gets stale data because truncate_pagecache() is not executed yet.

The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing file
size is in progress. This simplistic approach looks correct for the cases
w/o external changes. And to handle them properly, more sophisticated and
intrusive techniques (e.g. NFS-like one) would be required. I'd like
to postpone it until the issue is well discussed on the mailing list(s).

Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/dir.c | 7 ++++++-
fs/fuse/file.c | 8 +++++++-
fs/fuse/fuse_i.h | 2 ++
fs/fuse/inode.c | 3 ++-
4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a5d5b..c3eb878 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1590,6 +1590,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1617,8 +1618,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (IS_ERR(req))
return PTR_ERR(req);

- if (is_truncate)
+ if (is_truncate) {
fuse_set_nowrite(inode);
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ }

memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -1680,12 +1683,14 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
invalidate_inode_pages2(inode->i_mapping);
}

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return 0;

error:
if (is_truncate)
fuse_release_nowrite(inode);

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return err;
}

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c121fe..2f1e87c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -629,7 +629,8 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);

spin_lock(&fc->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size) {
+ if (attr_ver == fi->attr_version && size < inode->i_size &&
+ !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = ++fc->attr_version;
i_size_write(inode, size);
}
@@ -1032,12 +1033,16 @@ static ssize_t fuse_perform_write(struct file *file,
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
ssize_t res = 0;

if (is_bad_inode(inode))
return -EIO;

+ if (inode->i_size < pos + iov_iter_count(ii))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
do {
struct fuse_req *req;
ssize_t count;
@@ -1073,6 +1078,7 @@ static ssize_t fuse_perform_write(struct file *file,
if (res > 0)
fuse_write_update_size(inode, pos);

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
fuse_invalidate_attr(inode);

return res > 0 ? res : err;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fde7249..5ced199 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -115,6 +115,8 @@ struct fuse_inode {
enum {
/** Advise readdirplus */
FUSE_I_ADVISE_RDPLUS,
+ /** An operation changing file size is in progress */
+ FUSE_I_SIZE_UNSTABLE,
};

struct fuse_conn;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b57859..e0fe703 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -201,7 +201,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
struct timespec old_mtime;

spin_lock(&fc->lock);
- if (attr_version != 0 && fi->attr_version > attr_version) {
+ if ((attr_version != 0 && fi->attr_version > attr_version) ||
+ test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fc->lock);
return;
}


2013-08-29 09:25:43

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH] fuse: hotfix truncate_pagecache() issue

On Wed, Aug 28, 2013 at 04:21:46PM +0400, Maxim Patlasov wrote:
> The way how fuse calls truncate_pagecache() from fuse_change_attributes()
> is completely wrong. Because, w/o i_mutex held, we never sure whether
> 'oldsize' and 'attr->size' are valid by the time of execution of
> truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
> released fc->lock in the middle of fuse_change_attributes(), we completely
> loose control of actions which may happen with given inode until we reach
> truncate_pagecache. The list of potentially dangerous actions includes mmap-ed
> reads and writes, ftruncate(2) and write(2) extending file size.
>
> The typical outcome of doing truncate_pagecache() with outdated arguments is
> data corruption from user point of view. This is (in some sense) acceptable
> in cases when the issue is triggered by a change of the file on the server
> (i.e. externally wrt fuse operation), but it is absolutely intolerable in
> scenarios when a single fuse client modifies a file without any external
> intervention. A real life case I discovered by fsx-linux looked like this:
>
> 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
> FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
> 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
> to the server synchronously, then calls fuse_change_attributes(). The latter
> updates i_size, releases fc->lock, but before comparing oldsize vs attr->size..
> 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
> updating attributes and i_size, but now oldsize is equal to outarg.attr.size
> because i_size has just been updated (step 2). Hence, fuse_do_setattr()
> returns w/o calling truncate_pagecache().
> 4. As soon as ftruncate(2) completes, the user extends file size by write(2)
> making a hole in the middle of file, then reads data from the hole either by
> read(2) or mmap-ed read. The user expects to get zero data from the hole, but
> gets stale data because truncate_pagecache() is not executed yet.
>
> The patch is a hotfix resolving the issue in a simplistic way: let's skip
> dangerous i_size update and truncate_pagecache if an operation changing file
> size is in progress. This simplistic approach looks correct for the cases
> w/o external changes. And to handle them properly, more sophisticated and
> intrusive techniques (e.g. NFS-like one) would be required. I'd like
> to postpone it until the issue is well discussed on the mailing list(s).

Thanks for the analysis!

Okay, what about this even more simplistic approach?

Not tested, but I think it addresses the very crux of the issue: not truncating
the page cache even though we should.

AFAICS there's no such issue with write(2) or fallocate(2). But I haven't
thought about this very deeply.

And indeed, handling external changes even half way sanely is "fun".

Thanks,
Miklos

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a5d5b..fdb5036 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1675,7 +1675,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
* Only call invalidate_inode_pages2() after removing
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
- if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ if ((S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) ||
+ is_truncate) {
truncate_pagecache(inode, oldsize, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}

2013-08-29 13:01:12

by Maxim Patlasov

[permalink] [raw]
Subject: Re: [PATCH] fuse: hotfix truncate_pagecache() issue

Hi,

08/29/2013 01:25 PM, Miklos Szeredi пишет:
> On Wed, Aug 28, 2013 at 04:21:46PM +0400, Maxim Patlasov wrote:
>> The way how fuse calls truncate_pagecache() from fuse_change_attributes()
>> is completely wrong. Because, w/o i_mutex held, we never sure whether
>> 'oldsize' and 'attr->size' are valid by the time of execution of
>> truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
>> released fc->lock in the middle of fuse_change_attributes(), we completely
>> loose control of actions which may happen with given inode until we reach
>> truncate_pagecache. The list of potentially dangerous actions includes mmap-ed
>> reads and writes, ftruncate(2) and write(2) extending file size.
>>
>> The typical outcome of doing truncate_pagecache() with outdated arguments is
>> data corruption from user point of view. This is (in some sense) acceptable
>> in cases when the issue is triggered by a change of the file on the server
>> (i.e. externally wrt fuse operation), but it is absolutely intolerable in
>> scenarios when a single fuse client modifies a file without any external
>> intervention. A real life case I discovered by fsx-linux looked like this:
>>
>> 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
>> FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
>> 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
>> to the server synchronously, then calls fuse_change_attributes(). The latter
>> updates i_size, releases fc->lock, but before comparing oldsize vs attr->size..
>> 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
>> updating attributes and i_size, but now oldsize is equal to outarg.attr.size
>> because i_size has just been updated (step 2). Hence, fuse_do_setattr()
>> returns w/o calling truncate_pagecache().
>> 4. As soon as ftruncate(2) completes, the user extends file size by write(2)
>> making a hole in the middle of file, then reads data from the hole either by
>> read(2) or mmap-ed read. The user expects to get zero data from the hole, but
>> gets stale data because truncate_pagecache() is not executed yet.
>>
>> The patch is a hotfix resolving the issue in a simplistic way: let's skip
>> dangerous i_size update and truncate_pagecache if an operation changing file
>> size is in progress. This simplistic approach looks correct for the cases
>> w/o external changes. And to handle them properly, more sophisticated and
>> intrusive techniques (e.g. NFS-like one) would be required. I'd like
>> to postpone it until the issue is well discussed on the mailing list(s).
> Thanks for the analysis!
>
> Okay, what about this even more simplistic approach?
>
> Not tested, but I think it addresses the very crux of the issue: not truncating
> the page cache even though we should.

The patch looks fine, but it solves only one side of the problem
(exactly what you formulated above). Another side is opposite:
truncating page cache too late, when the state of inode changed
significantly. The beginning may be as in the scenario above:
fuse_dentry_revalidate() discovered that i_size changed (due to our own
fuse_do_setattr()) and is going to call truncate_pagecache() for some
'new_size' it believes valid right now. But by the time that particular
truncate_pagecache() is called, a lot of things may happen:

1) fuse_do_setattr() called truncate_pagecache() according to your patch
2) the file was extended either by write(2) or ftruncate(2) or fallocate(2).
3) mmap-ed write makes a page in the extended region dirty

The result will be the loss of the data the user wrote in step '3)'. (my
patch solves the issue at least for all cases w/o external changes)

>
> AFAICS there's no such issue with write(2) or fallocate(2). But I haven't
> thought about this very deeply.

I added bits to fuse_perform_write() to address that other side of
the issue. Fixing fallocate is also required, but I postponed it until
you include that other fix for fallocate (incorrect use of
fuse_set_nowrite()).

Thanks,
Maxim

2013-08-29 16:18:13

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH] fuse: hotfix truncate_pagecache() issue

On Thu, Aug 29, 2013 at 3:01 PM, Maxim Patlasov <[email protected]> wrote:
> The patch looks fine, but it solves only one side of the problem (exactly
> what you formulated above). Another side is opposite: truncating page cache
> too late, when the state of inode changed significantly. The beginning may
> be as in the scenario above: fuse_dentry_revalidate() discovered that i_size
> changed (due to our own fuse_do_setattr()) and is going to call
> truncate_pagecache() for some 'new_size' it believes valid right now. But by
> the time that particular truncate_pagecache() is called, a lot of things may
> happen:
>
> 1) fuse_do_setattr() called truncate_pagecache() according to your patch
> 2) the file was extended either by write(2) or ftruncate(2) or fallocate(2).
> 3) mmap-ed write make a page in the extended region dirty
>
> The result will be the lost of data user wrote on the step '3)'. (my patch
> solves the issue at least for all cases w/o external changes)

I get it.

Could you please resend the patch with all these failure cases
described in the header?

Thanks,
Miklos

2013-08-30 13:06:19

by Maxim Patlasov

[permalink] [raw]
Subject: [PATCH] fuse: hotfix truncate_pagecache() issue -v2

The way fuse calls truncate_pagecache() from fuse_change_attributes()
is completely wrong. Because, w/o i_mutex held, we are never sure whether
'oldsize' and 'attr->size' are valid by the time of execution of
truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we
release fc->lock in the middle of fuse_change_attributes(), we completely
lose control of actions which may happen with the given inode until we reach
truncate_pagecache. The list of potentially dangerous actions includes mmap-ed
reads and writes, ftruncate(2) and write(2) extending file size.

The typical outcome of doing truncate_pagecache() with outdated arguments is
data corruption from the user's point of view. This is (in some sense) acceptable
in cases when the issue is triggered by a change of the file on the server
(i.e. externally wrt fuse operation), but it is absolutely intolerable in
scenarios when a single fuse client modifies a file without any external
intervention. A real life case I discovered by fsx-linux looked like this:

1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends
FUSE_SETATTR to the server synchronously, but before getting fc->lock ...
2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP
to the server synchronously, then calls fuse_change_attributes(). The latter
updates i_size, releases fc->lock, but before comparing oldsize vs attr->size..
3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and
updating attributes and i_size, but now oldsize is equal to outarg.attr.size
because i_size has just been updated (step 2). Hence, fuse_do_setattr()
returns w/o calling truncate_pagecache().
4. As soon as ftruncate(2) completes, the user extends file size by write(2)
making a hole in the middle of file, then reads data from the hole either by
read(2) or mmap-ed read. The user expects to get zero data from the hole, but
gets stale data because truncate_pagecache() is not executed yet.

The scenario above illustrates one side of the problem: not truncating the
page cache even though we should. Another side corresponds to truncating page
cache too late, when the state of inode changed significantly. Theoretically,
the following is possible:

1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size
changed (due to our own fuse_do_setattr()) and is going to call
truncate_pagecache() for some 'new_size' it believes valid right now. But by
the time that particular truncate_pagecache() is called ...
2. fuse_do_setattr() returns (either having called truncate_pagecache() or
not -- it doesn't matter).
3. The file is extended either by write(2) or ftruncate(2) or fallocate(2).
4. mmap-ed write makes a page in the extended region dirty.

The result will be the loss of the data the user wrote in the fourth step.

The patch is a hotfix resolving the issue in a simplistic way: let's skip
dangerous i_size update and truncate_pagecache if an operation changing file
size is in progress. This simplistic approach looks correct for the cases
w/o external changes. And to handle them properly, more sophisticated and
intrusive techniques (e.g. NFS-like one) would be required. I'd like
to postpone it until the issue is well discussed on the mailing list(s).

Changed in v2:
- improved patch description to cover both sides of the issue.

Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/dir.c | 7 ++++++-
fs/fuse/file.c | 8 +++++++-
fs/fuse/fuse_i.h | 2 ++
fs/fuse/inode.c | 3 ++-
4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 72a5d5b..c3eb878 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1590,6 +1590,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
@@ -1617,8 +1618,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (IS_ERR(req))
return PTR_ERR(req);

- if (is_truncate)
+ if (is_truncate) {
fuse_set_nowrite(inode);
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ }

memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
@@ -1680,12 +1683,14 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
invalidate_inode_pages2(inode->i_mapping);
}

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return 0;

error:
if (is_truncate)
fuse_release_nowrite(inode);

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
return err;
}

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c121fe..2f1e87c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -629,7 +629,8 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
struct fuse_inode *fi = get_fuse_inode(inode);

spin_lock(&fc->lock);
- if (attr_ver == fi->attr_version && size < inode->i_size) {
+ if (attr_ver == fi->attr_version && size < inode->i_size &&
+ !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = ++fc->attr_version;
i_size_write(inode, size);
}
@@ -1032,12 +1033,16 @@ static ssize_t fuse_perform_write(struct file *file,
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
ssize_t res = 0;

if (is_bad_inode(inode))
return -EIO;

+ if (inode->i_size < pos + iov_iter_count(ii))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
do {
struct fuse_req *req;
ssize_t count;
@@ -1073,6 +1078,7 @@ static ssize_t fuse_perform_write(struct file *file,
if (res > 0)
fuse_write_update_size(inode, pos);

+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
fuse_invalidate_attr(inode);

return res > 0 ? res : err;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fde7249..5ced199 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -115,6 +115,8 @@ struct fuse_inode {
enum {
/** Advise readdirplus */
FUSE_I_ADVISE_RDPLUS,
+ /** An operation changing file size is in progress */
+ FUSE_I_SIZE_UNSTABLE,
};

struct fuse_conn;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b57859..e0fe703 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -201,7 +201,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
struct timespec old_mtime;

spin_lock(&fc->lock);
- if (attr_version != 0 && fi->attr_version > attr_version) {
+ if ((attr_version != 0 && fi->attr_version > attr_version) ||
+ test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
spin_unlock(&fc->lock);
return;
}