Miklos,
This version implements your suggestion about cropping secondary requests
in fuse_send_writepage().
> The patch-set fixes a few issues I found while reviewing your
> recent "fixes for writepages" patch-set.
>
> The patches are for writepages.v2. If they are OK, I'll re-check
> them for for-next.
Thanks,
Maxim
---
Maxim Patlasov (4):
fuse: writepages: roll back changes if request not found
fuse: writepages: crop secondary requests
fuse: writepage: update bdi writeout when deleting secondary request
fuse: writepages: protect secondary requests from fuse file release
fs/fuse/file.c | 64 +++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 51 insertions(+), 13 deletions(-)
--
Signature
fuse_writepage_in_flight() returns false if it fails to find request with
given index in fi->writepages. Then the caller proceeds with populating
data->orig_pages[] and incrementing req->num_pages. Hence,
fuse_writepage_in_flight() must revert changes it made in request before
returning false.
Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/file.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 135360e..575e44f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1659,8 +1659,11 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
break;
}
}
- if (!found)
+ if (!found) {
+ new_req->num_pages = 0;
+ list_add(&new_req->writepages_entry, &fi->writepages);
goto out_unlock;
+ }
for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
BUG_ON(tmp->inode != new_req->inode);
If writeback happens while fuse is in FUSE_NOWRITE condition, the request
will be queued but not processed immediately (see fuse_flush_writepages()).
Until FUSE_NOWRITE becomes relaxed, more writebacks can happen. They will
be queued as "secondary" requests to that first ("primary") request.
Existing implementation crops only primary request. This is not correct
because a subsequent extending write(2) may increase i_size and then secondary
requests won't be cropped properly. The result would be stale data written to
the server to a file offset where zeros must be.
Similar problem may happen if secondary requests are attached to an in-flight
request that was already cropped.
The patch solves the issue by cropping all secondary requests in
fuse_writepage_end(). Thanks to Miklos for idea.
Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/file.c | 52 +++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 41 insertions(+), 11 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 575e44f..a3c7123 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1435,6 +1435,22 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fi->page_waitq);
}
+/* Returns zero if the request is truncated off completely */
+static inline loff_t fuse_crop_request(struct fuse_req *req, loff_t size)
+{
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ __u64 data_size = inarg->size ? : req->num_pages * PAGE_CACHE_SIZE;
+
+ if (inarg->offset + data_size <= size)
+ inarg->size = data_size;
+ else if (inarg->offset < size)
+ inarg->size = size - inarg->offset;
+ else
+ return 0;
+
+ return inarg->size;
+}
+
/* Called under fc->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
__releases(fc->lock)
@@ -1442,22 +1458,14 @@ __acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
loff_t size = i_size_read(req->inode);
- struct fuse_write_in *inarg = &req->misc.write.in;
- __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
if (!fc->connected)
goto out_free;
- if (inarg->offset + data_size <= size) {
- inarg->size = data_size;
- } else if (inarg->offset < size) {
- inarg->size = size - inarg->offset;
- } else {
- /* Got truncated off completely */
- goto out_free;
- }
+ req->in.args[1].size = fuse_crop_request(req, size);
+ if (req->in.args[1].size == 0)
+ goto out_free; /* Got truncated off completely */
- req->in.args[1].size = inarg->size;
fi->writectr++;
fuse_request_send_background_locked(fc, req);
return;
@@ -1495,13 +1503,24 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct inode *inode = req->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_req *drop = NULL;
mapping_set_error(inode->i_mapping, req->out.h.error);
spin_lock(&fc->lock);
while (req->misc.write.next) {
struct fuse_req *next = req->misc.write.next;
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ loff_t size = inarg->offset + inarg->size;
+
req->misc.write.next = next->misc.write.next;
next->misc.write.next = NULL;
+
+ if (fuse_crop_request(next, size) == 0) {
+ next->misc.write.next = drop;
+ drop = next;
+ continue;
+ }
+
list_add(&next->writepages_entry, &fi->writepages);
list_add_tail(&next->list, &fi->queued_writes);
fuse_flush_writepages(inode);
@@ -1510,6 +1529,17 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
fuse_writepage_finish(fc, req);
spin_unlock(&fc->lock);
fuse_writepage_free(fc, req);
+
+ while (drop) {
+ struct fuse_req *next = drop->misc.write.next;
+ struct backing_dev_info *bdi =
+ drop->inode->i_mapping->backing_dev_info;
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_zone_page_state(drop->pages[0], NR_WRITEBACK_TEMP);
+ fuse_writepage_free(fc, drop);
+ fuse_put_request(fc, drop);
+ drop = next;
+ }
}
static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
BDI_WRITTEN counter is used to estimate bdi bandwidth. It must be incremented
every time as bdi ends page writeback. No matter whether it was fulfilled by
actual write or by discarding the request (e.g. due to shrunk i_size).
Note that even before writepages patches, the case "Got truncated off
completely" was handled in fuse_send_writepage() by calling
fuse_writepage_finish() which updated BDI_WRITTEN unconditionally.
Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/file.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a3c7123..5d323bd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1536,6 +1536,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
drop->inode->i_mapping->backing_dev_info;
dec_bdi_stat(bdi, BDI_WRITEBACK);
dec_zone_page_state(drop->pages[0], NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
fuse_writepage_free(fc, drop);
fuse_put_request(fc, drop);
drop = next;
@@ -1706,11 +1707,14 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
old_req->state == FUSE_REQ_PENDING)) {
+ struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
- dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
goto out;
All async fuse requests must be supplied with extra reference to a fuse file.
This is necessary to ensure that the fuse file is not released until all
in-flight requests are completed. Fuse secondary writeback requests must
obey this rule as well.
Signed-off-by: Maxim Patlasov <[email protected]>
---
fs/fuse/file.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5d323bd..266a792 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1521,6 +1521,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
continue;
}
+ next->ff = fuse_file_get(req->ff);
list_add(&next->writepages_entry, &fi->writepages);
list_add_tail(&next->list, &fi->queued_writes);
fuse_flush_writepages(inode);
On Wed, Oct 02, 2013 at 09:38:32PM +0400, Maxim Patlasov wrote:
> If writeback happens while fuse is in FUSE_NOWRITE condition, the request
> will be queued but not processed immediately (see fuse_flush_writepages()).
> Until FUSE_NOWRITE becomes relaxed, more writebacks can happen. They will
> be queued as "secondary" requests to that first ("primary") request.
>
> Existing implementation crops only primary request. This is not correct
> because a subsequent extending write(2) may increase i_size and then secondary
> requests won't be cropped properly. The result would be stale data written to
> the server to a file offset where zeros must be.
>
> Similar problem may happen if secondary requests are attached to an in-flight
> request that was already cropped.
>
> The patch solves the issue by cropping all secondary requests in
> fuse_writepage_end(). Thanks to Miklos for idea.
How about this, even simpler, one?
Thanks,
Miklos
Index: linux/fs/fuse/file.c
===================================================================
--- linux.orig/fs/fuse/file.c 2013-10-03 11:27:00.597084704 +0200
+++ linux/fs/fuse/file.c 2013-10-03 11:53:30.477208467 +0200
@@ -1436,12 +1436,12 @@ static void fuse_writepage_finish(struct
}
/* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+ loff_t size)
__releases(fc->lock)
__acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
- loff_t size = i_size_read(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
@@ -1476,7 +1476,7 @@ __acquires(fc->lock)
*
* Called with fc->lock
*/
-void fuse_flush_writepages(struct inode *inode)
+void __fuse_flush_writepages(struct inode *inode, loff_t crop)
__releases(fc->lock)
__acquires(fc->lock)
{
@@ -1487,9 +1487,15 @@ __acquires(fc->lock)
while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
req = list_entry(fi->queued_writes.next, struct fuse_req, list);
list_del_init(&req->list);
- fuse_send_writepage(fc, req);
+ fuse_send_writepage(fc, req, crop);
}
}
+void fuse_flush_writepages(struct inode *inode)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ __fuse_flush_writepages(inode, i_size_read(inode));
+}
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
{
@@ -1499,12 +1505,13 @@ static void fuse_writepage_end(struct fu
mapping_set_error(inode->i_mapping, req->out.h.error);
spin_lock(&fc->lock);
while (req->misc.write.next) {
+ struct fuse_write_in *inarg = &req->misc.write.in;
struct fuse_req *next = req->misc.write.next;
req->misc.write.next = next->misc.write.next;
next->misc.write.next = NULL;
list_add(&next->writepages_entry, &fi->writepages);
list_add_tail(&next->list, &fi->queued_writes);
- fuse_flush_writepages(inode);
+ __fuse_flush_writepages(inode, inarg->offset + inarg->size);
}
fi->writectr--;
fuse_writepage_finish(fc, req);
On Wed, Oct 02, 2013 at 09:38:43PM +0400, Maxim Patlasov wrote:
> BDI_WRITTEN counter is used to estimate bdi bandwidth. It must be incremented
> every time as bdi ends page writeback. No matter whether it was fulfilled by
> actual write or by discarding the request (e.g. due to shrunk i_size).
>
> Note that even before writepages patches, the case "Got truncated off
> completely" was handled in fuse_send_writepage() by calling
> fuse_writepage_finish() which updated BDI_WRITTEN unconditionally.
Hmm, I'm not sure I can agree with this. If BDI_WRITTEN is used for bandwidth
estimation, then I think it's more correct not to count rewrites and truncated
pages.
But I don't see this matter either way since this is just used as a heuristic
and the occasional extra or lack of count shouldn't make a significant
difference.
Thanks,
Miklos
>
> Signed-off-by: Maxim Patlasov <[email protected]>
> ---
> fs/fuse/file.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index a3c7123..5d323bd 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1536,6 +1536,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
> drop->inode->i_mapping->backing_dev_info;
> dec_bdi_stat(bdi, BDI_WRITEBACK);
> dec_zone_page_state(drop->pages[0], NR_WRITEBACK_TEMP);
> + bdi_writeout_inc(bdi);
> fuse_writepage_free(fc, drop);
> fuse_put_request(fc, drop);
> drop = next;
> @@ -1706,11 +1707,14 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
>
> if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
> old_req->state == FUSE_REQ_PENDING)) {
> + struct backing_dev_info *bdi = page->mapping->backing_dev_info;
> +
> copy_highpage(old_req->pages[0], page);
> spin_unlock(&fc->lock);
>
> - dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
> + dec_bdi_stat(bdi, BDI_WRITEBACK);
> dec_zone_page_state(page, NR_WRITEBACK_TEMP);
> + bdi_writeout_inc(bdi);
> fuse_writepage_free(fc, new_req);
> fuse_request_free(new_req);
> goto out;
>
On Wed, Oct 02, 2013 at 09:38:54PM +0400, Maxim Patlasov wrote:
> All async fuse requests must be supplied with extra reference to a fuse file.
> This is necessary to ensure that the fuse file is not released until all
> in-flight requests are completed. Fuse secondary writeback requests must
> obey this rule as well.
Okay, applied.
Thanks,
Miklos
>
> Signed-off-by: Maxim Patlasov <[email protected]>
> ---
> fs/fuse/file.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 5d323bd..266a792 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1521,6 +1521,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
> continue;
> }
>
> + next->ff = fuse_file_get(req->ff);
> list_add(&next->writepages_entry, &fi->writepages);
> list_add_tail(&next->list, &fi->queued_writes);
> fuse_flush_writepages(inode);
>
On 10/03/2013 01:57 PM, Miklos Szeredi wrote:
> On Wed, Oct 02, 2013 at 09:38:32PM +0400, Maxim Patlasov wrote:
>> If writeback happens while fuse is in FUSE_NOWRITE condition, the request
>> will be queued but not processed immediately (see fuse_flush_writepages()).
>> Until FUSE_NOWRITE becomes relaxed, more writebacks can happen. They will
>> be queued as "secondary" requests to that first ("primary") request.
>>
>> Existing implementation crops only primary request. This is not correct
>> because a subsequent extending write(2) may increase i_size and then secondary
>> requests won't be cropped properly. The result would be stale data written to
>> the server to a file offset where zeros must be.
>>
>> Similar problem may happen if secondary requests are attached to an in-flight
>> request that was already cropped.
>>
>> The patch solves the issue by cropping all secondary requests in
>> fuse_writepage_end(). Thanks to Miklos for idea.
> How about this, even simpler, one?
Very cute, but unfortunately it has a flaw. See please inline comment below.
>
> Thanks,
> Miklos
>
>
> Index: linux/fs/fuse/file.c
> ===================================================================
> --- linux.orig/fs/fuse/file.c 2013-10-03 11:27:00.597084704 +0200
> +++ linux/fs/fuse/file.c 2013-10-03 11:53:30.477208467 +0200
> @@ -1436,12 +1436,12 @@ static void fuse_writepage_finish(struct
> }
>
> /* Called under fc->lock, may release and reacquire it */
> -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
> +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
> + loff_t size)
> __releases(fc->lock)
> __acquires(fc->lock)
> {
> struct fuse_inode *fi = get_fuse_inode(req->inode);
> - loff_t size = i_size_read(req->inode);
> struct fuse_write_in *inarg = &req->misc.write.in;
> __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
>
> @@ -1476,7 +1476,7 @@ __acquires(fc->lock)
> *
> * Called with fc->lock
> */
> -void fuse_flush_writepages(struct inode *inode)
> +void __fuse_flush_writepages(struct inode *inode, loff_t crop)
> __releases(fc->lock)
> __acquires(fc->lock)
> {
> @@ -1487,9 +1487,15 @@ __acquires(fc->lock)
> while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
> req = list_entry(fi->queued_writes.next, struct fuse_req, list);
> list_del_init(&req->list);
> - fuse_send_writepage(fc, req);
> + fuse_send_writepage(fc, req, crop);
> }
> }
> +void fuse_flush_writepages(struct inode *inode)
> +__releases(fc->lock)
> +__acquires(fc->lock)
> +{
> + __fuse_flush_writepages(inode, i_size_read(inode));
> +}
>
> static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
> {
> @@ -1499,12 +1505,13 @@ static void fuse_writepage_end(struct fu
> mapping_set_error(inode->i_mapping, req->out.h.error);
> spin_lock(&fc->lock);
> while (req->misc.write.next) {
> + struct fuse_write_in *inarg = &req->misc.write.in;
> struct fuse_req *next = req->misc.write.next;
> req->misc.write.next = next->misc.write.next;
> next->misc.write.next = NULL;
> list_add(&next->writepages_entry, &fi->writepages);
> list_add_tail(&next->list, &fi->queued_writes);
> - fuse_flush_writepages(inode);
> + __fuse_flush_writepages(inode, inarg->offset + inarg->size);
__fuse_flush_writepages() will ignore its 'crop' arg if fi->writectr is
below zero. This can easily happen if a request is finalized after
fuse_set_nowrite(). So in a scenario like this:
1. There is an in-flight primary request with a chain of secondary ones.
2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
fi->writectr negative and starts waiting for completion of that
in-flight request
3. Userspace fuse daemon ACKs the request and fuse_writepage_end() is
called; it calls __fuse_flush_writepages(), but the latter does nothing
because fi->writectr < 0
4. fuse_do_setattr() proceeds extending i_size and calling
__fuse_release_nowrite(). But now new (increased) i_size will be used as
'crop' arg of __fuse_flush_writepages()
stale data can leak to the server.
Thanks,
Maxim
On 10/03/2013 02:26 PM, Miklos Szeredi wrote:
> On Wed, Oct 02, 2013 at 09:38:43PM +0400, Maxim Patlasov wrote:
>> BDI_WRITTEN counter is used to estimate bdi bandwidth. It must be incremented
>> every time as bdi ends page writeback. No matter whether it was fulfilled by
>> actual write or by discarding the request (e.g. due to shrunk i_size).
>>
>> Note that even before writepages patches, the case "Got truncated off
>> completely" was handled in fuse_send_writepage() by calling
>> fuse_writepage_finish() which updated BDI_WRITTEN unconditionally.
> Hmm, I'm not sure I can agree with this. If BDI_WRITTEN is used for bandwidth
> estimation, then I think it's more correct not to count rewrites and truncated
> pages.
I thought about it before submitting the patch, but my understanding is
a bit different. Look how balance_dirty_pages and friends juggle with
BDI_WRITTEN and BDI_DIRTIED. That layer knows nothing about fuse and its
internals. Imagine that right now (if actual backend throughput is about
10MB/sec) you believe that dirtying 26 pages per 10 milliseconds is
fine, but when they lapsed you discovers that BDI_DIRTIED delta is 26
while BDI_WRITTEN delta is only 13. Logically, you must decide to cut
dirty-rate by factor two, but the decision would be incorrect in case of
unaccounted truncated rewrites.
>
> But I don't see this matter either way since this is just used as a heuristic
> and the occasional extra or lack of count shouldn't make a significant
> difference.
I agree, but for another reason. I think it won't make a significant
difference because rewrites coinciding with writebacks coinciding with
truncations will happen very rare in real life.
Thanks,
Maxim
>
> Thanks,
> Miklos
>
>> Signed-off-by: Maxim Patlasov <[email protected]>
>> ---
>> fs/fuse/file.c | 6 +++++-
>> 1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index a3c7123..5d323bd 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -1536,6 +1536,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
>> drop->inode->i_mapping->backing_dev_info;
>> dec_bdi_stat(bdi, BDI_WRITEBACK);
>> dec_zone_page_state(drop->pages[0], NR_WRITEBACK_TEMP);
>> + bdi_writeout_inc(bdi);
>> fuse_writepage_free(fc, drop);
>> fuse_put_request(fc, drop);
>> drop = next;
>> @@ -1706,11 +1707,14 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
>>
>> if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
>> old_req->state == FUSE_REQ_PENDING)) {
>> + struct backing_dev_info *bdi = page->mapping->backing_dev_info;
>> +
>> copy_highpage(old_req->pages[0], page);
>> spin_unlock(&fc->lock);
>>
>> - dec_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
>> + dec_bdi_stat(bdi, BDI_WRITEBACK);
>> dec_zone_page_state(page, NR_WRITEBACK_TEMP);
>> + bdi_writeout_inc(bdi);
>> fuse_writepage_free(fc, new_req);
>> fuse_request_free(new_req);
>> goto out;
>>
>
On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
> 1. There is an in-flight primary request with a chain of secondary ones.
> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
> fi->writectr negative and starts waiting for completion of that
> in-flight request
> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
> is called; it calls __fuse_flush_writepages(), but the latter does
> nothing because fi->writectr < 0
> 4. fuse_do_setattr() proceeds extending i_size and calling
> __fuse_release_nowrite(). But now new (increased) i_size will be
> used as 'crop' arg of __fuse_flush_writepages()
>
> stale data can leak to the server.
So, lets do this then: skip fuse_flush_writepages() and call
fuse_send_writepage() directly. It will ignore the NOWRITE logic, but that's
okay, this happens rarely and cannot happen more than once in a row.
Does this look good?
Can you actually trigger this path with your testing?
Thanks,
Miklos
Index: linux/fs/fuse/file.c
===================================================================
--- linux.orig/fs/fuse/file.c 2013-10-03 12:12:33.480918954 +0200
+++ linux/fs/fuse/file.c 2013-10-03 17:06:23.702510854 +0200
@@ -1436,12 +1436,12 @@ static void fuse_writepage_finish(struct
}
/* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+ loff_t size)
__releases(fc->lock)
__acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
- loff_t size = i_size_read(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
@@ -1482,12 +1482,13 @@ __acquires(fc->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ size_t crop = i_size_read(inode);
struct fuse_req *req;
while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
req = list_entry(fi->queued_writes.next, struct fuse_req, list);
list_del_init(&req->list);
- fuse_send_writepage(fc, req);
+ fuse_send_writepage(fc, req, crop);
}
}
@@ -1499,12 +1500,13 @@ static void fuse_writepage_end(struct fu
mapping_set_error(inode->i_mapping, req->out.h.error);
spin_lock(&fc->lock);
while (req->misc.write.next) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_write_in *inarg = &req->misc.write.in;
struct fuse_req *next = req->misc.write.next;
req->misc.write.next = next->misc.write.next;
next->misc.write.next = NULL;
list_add(&next->writepages_entry, &fi->writepages);
- list_add_tail(&next->list, &fi->queued_writes);
- fuse_flush_writepages(inode);
+ fuse_send_writepage(fc, next, inarg->offset + inarg->size);
}
fi->writectr--;
fuse_writepage_finish(fc, req);
On 10/03/2013 07:14 PM, Miklos Szeredi wrote:
> On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
>
>> 1. There is an in-flight primary request with a chain of secondary ones.
>> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
>> fi->writectr negative and starts waiting for completion of that
>> in-flight request
>> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
>> is called; it calls __fuse_flush_writepages(), but the latter does
>> nothing because fi->writectr < 0
>> 4. fuse_do_setattr() proceeds extending i_size and calling
>> __fuse_release_nowrite(). But now new (increased) i_size will be
>> used as 'crop' arg of __fuse_flush_writepages()
>>
>> stale data can leak to the server.
> So, lets do this then: skip fuse_flush_writepages() and call
> fuse_send_writepage() directly. It will ignore the NOWRITE logic, but that's
> okay, this happens rarely and cannot happen more than once in a row.
>
> Does this look good?
Yes, but let's at least add a comment explaining why it's safe. There
are three different cases and what you write above explains only one of
them:
1st case (trivial): there are no concurrent activities using
fuse_set/release_nowrite. Then we're on safe side because
fuse_flush_writepages() would call fuse_send_writepage() anyway.
2nd case: someone called fuse_set_nowrite and it is waiting now for
completion of all in-flight requests. Here what you wrote about
"happening rarely and no more than once" is applicable.
3rd case: someone (e.g. fuse_do_setattr()) is in the middle of
fuse_set_nowrite..fuse_release_nowrite section. The fact that
fuse_set_nowrite returned implies that all in-flight requests were
completed along with all its secondary requests (because we increment
writectr for a secondry before decrementing it for the primary -- that's
how fuse_writepage_end is implemeted). Further requests are blocked by
negative writectr. Hence there cannot be any in-flight requests and no
invocations of fuse_writepage_end while we're in
fuse_set_nowrite..fuse_release_nowrite section.
It looks obvious now, but I'm not sure we'll able to recollect it later.
>
> Can you actually trigger this path with your testing?
No.
Thanks,
Maxim
>
> Thanks,
> Miklos
>
>
> Index: linux/fs/fuse/file.c
> ===================================================================
> --- linux.orig/fs/fuse/file.c 2013-10-03 12:12:33.480918954 +0200
> +++ linux/fs/fuse/file.c 2013-10-03 17:06:23.702510854 +0200
> @@ -1436,12 +1436,12 @@ static void fuse_writepage_finish(struct
> }
>
> /* Called under fc->lock, may release and reacquire it */
> -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
> +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
> + loff_t size)
> __releases(fc->lock)
> __acquires(fc->lock)
> {
> struct fuse_inode *fi = get_fuse_inode(req->inode);
> - loff_t size = i_size_read(req->inode);
> struct fuse_write_in *inarg = &req->misc.write.in;
> __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
>
> @@ -1482,12 +1482,13 @@ __acquires(fc->lock)
> {
> struct fuse_conn *fc = get_fuse_conn(inode);
> struct fuse_inode *fi = get_fuse_inode(inode);
> + size_t crop = i_size_read(inode);
> struct fuse_req *req;
>
> while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
> req = list_entry(fi->queued_writes.next, struct fuse_req, list);
> list_del_init(&req->list);
> - fuse_send_writepage(fc, req);
> + fuse_send_writepage(fc, req, crop);
> }
> }
>
> @@ -1499,12 +1500,13 @@ static void fuse_writepage_end(struct fu
> mapping_set_error(inode->i_mapping, req->out.h.error);
> spin_lock(&fc->lock);
> while (req->misc.write.next) {
> + struct fuse_conn *fc = get_fuse_conn(inode);
> + struct fuse_write_in *inarg = &req->misc.write.in;
> struct fuse_req *next = req->misc.write.next;
> req->misc.write.next = next->misc.write.next;
> next->misc.write.next = NULL;
> list_add(&next->writepages_entry, &fi->writepages);
> - list_add_tail(&next->list, &fi->queued_writes);
> - fuse_flush_writepages(inode);
> + fuse_send_writepage(fc, next, inarg->offset + inarg->size);
> }
> fi->writectr--;
> fuse_writepage_finish(fc, req);
>
>
On Thu, Oct 3, 2013 at 5:50 PM, Maxim Patlasov <[email protected]> wrote:
> On 10/03/2013 07:14 PM, Miklos Szeredi wrote:
>>
>> On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
>>
>>> 1. There is an in-flight primary request with a chain of secondary ones.
>>> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
>>> fi->writectr negative and starts waiting for completion of that
>>> in-flight request
>>> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
>>> is called; it calls __fuse_flush_writepages(), but the latter does
>>> nothing because fi->writectr < 0
>>> 4. fuse_do_setattr() proceeds extending i_size and calling
>>> __fuse_release_nowrite(). But now new (increased) i_size will be
>>> used as 'crop' arg of __fuse_flush_writepages()
>>>
>>> stale data can leak to the server.
>>
>> So, lets do this then: skip fuse_flush_writepages() and call
>> fuse_send_writepage() directly. It will ignore the NOWRITE logic, but
>> that's
>> okay, this happens rarely and cannot happen more than once in a row.
>>
>> Does this look good?
>
>
> Yes, but let's at least add a comment explaining why it's safe. There are
> three different cases and what you write above explains only one of them:
>
> 1st case (trivial): there are no concurrent activities using
> fuse_set/release_nowrite. Then we're on safe side because
> fuse_flush_writepages() would call fuse_send_writepage() anyway.
> 2nd case: someone called fuse_set_nowrite and it is waiting now for
> completion of all in-flight requests. Here what you wrote about "happening
> rarely and no more than once" is applicable.
> 3rd case: someone (e.g. fuse_do_setattr()) is in the middle of
> fuse_set_nowrite..fuse_release_nowrite section. The fact that
> fuse_set_nowrite returned implies that all in-flight requests were completed
> along with all its secondary requests (because we increment writectr for a
> secondry before decrementing it for the primary -- that's how
> fuse_writepage_end is implemeted). Further requests are blocked by negative
> writectr. Hence there cannot be any in-flight requests and no invocations of
> fuse_writepage_end while we're in fuse_set_nowrite..fuse_release_nowrite
> section.
>
> It looks obvious now, but I'm not sure we'll able to recollect it later.
Added your analysis as a comment and all patches pushed to writepages.v2.
>> Can you actually trigger this path with your testing?
>
>
> No.
Hmm, did you do any testing with the wait-for-page-writeback disabled
in fuse_mkwrite()?
Thanks,
Miklos
On 10/03/2013 08:09 PM, Miklos Szeredi wrote:
> On Thu, Oct 3, 2013 at 5:50 PM, Maxim Patlasov <[email protected]> wrote:
>> On 10/03/2013 07:14 PM, Miklos Szeredi wrote:
>>> On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
>>>
>>>> 1. There is an in-flight primary request with a chain of secondary ones.
>>>> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
>>>> fi->writectr negative and starts waiting for completion of that
>>>> in-flight request
>>>> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
>>>> is called; it calls __fuse_flush_writepages(), but the latter does
>>>> nothing because fi->writectr < 0
>>>> 4. fuse_do_setattr() proceeds extending i_size and calling
>>>> __fuse_release_nowrite(). But now new (increased) i_size will be
>>>> used as 'crop' arg of __fuse_flush_writepages()
>>>>
>>>> stale data can leak to the server.
>>> So, lets do this then: skip fuse_flush_writepages() and call
>>> fuse_send_writepage() directly. It will ignore the NOWRITE logic, but
>>> that's
>>> okay, this happens rarely and cannot happen more than once in a row.
>>>
>>> Does this look good?
>>
>> Yes, but let's at least add a comment explaining why it's safe. There are
>> three different cases and what you write above explains only one of them:
>>
>> 1st case (trivial): there are no concurrent activities using
>> fuse_set/release_nowrite. Then we're on safe side because
>> fuse_flush_writepages() would call fuse_send_writepage() anyway.
>> 2nd case: someone called fuse_set_nowrite and it is waiting now for
>> completion of all in-flight requests. Here what you wrote about "happening
>> rarely and no more than once" is applicable.
>> 3rd case: someone (e.g. fuse_do_setattr()) is in the middle of
>> fuse_set_nowrite..fuse_release_nowrite section. The fact that
>> fuse_set_nowrite returned implies that all in-flight requests were completed
>> along with all its secondary requests (because we increment writectr for a
>> secondry before decrementing it for the primary -- that's how
>> fuse_writepage_end is implemeted). Further requests are blocked by negative
>> writectr. Hence there cannot be any in-flight requests and no invocations of
>> fuse_writepage_end while we're in fuse_set_nowrite..fuse_release_nowrite
>> section.
>>
>> It looks obvious now, but I'm not sure we'll able to recollect it later.
> Added your analysis as a comment and all patches pushed to writepages.v2.
Great! So I can proceed with re-basing the rest of
writeback-cache-policy pile to writepages.v2 soon.
>
>>> Can you actually trigger this path with your testing?
>>
>> No.
> Hmm, did you do any testing with the wait-for-page-writeback disabled
> in fuse_mkwrite()?
Yes, of course. I've been doing that for a week on two very different
h/w nodes, but I'm using ordinary fsx (not some artificial hand-crafted
mmap-writer) and I usually get only a dozen "rewrite: 1" messages per
night. This is enough to make sure that rewrite code main-path is OK,
but not enough to be sure that all corner cases are covered.
Thanks,
Maxim
Hi Miklos,
On 10/03/2013 08:22 PM, Maxim Patlasov wrote:
> On 10/03/2013 08:09 PM, Miklos Szeredi wrote:
>> On Thu, Oct 3, 2013 at 5:50 PM, Maxim Patlasov <[email protected]> wrote:
>>> On 10/03/2013 07:14 PM, Miklos Szeredi wrote:
>>>> On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
>>>>
>>>>> 1. There is an in-flight primary request with a chain of secondary ones.
>>>>> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
>>>>> fi->writectr negative and starts waiting for completion of that
>>>>> in-flight request
>>>>> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
>>>>> is called; it calls __fuse_flush_writepages(), but the latter does
>>>>> nothing because fi->writectr < 0
>>>>> 4. fuse_do_setattr() proceeds extending i_size and calling
>>>>> __fuse_release_nowrite(). But now new (increased) i_size will be
>>>>> used as 'crop' arg of __fuse_flush_writepages()
>>>>>
>>>>> stale data can leak to the server.
>>>> So, lets do this then: skip fuse_flush_writepages() and call
>>>> fuse_send_writepage() directly. It will ignore the NOWRITE logic, but
>>>> that's
>>>> okay, this happens rarely and cannot happen more than once in a row.
>>>>
>>>> Does this look good?
>>> Yes, but let's at least add a comment explaining why it's safe. There are
>>> three different cases and what you write above explains only one of them:
>>>
>>> 1st case (trivial): there are no concurrent activities using
>>> fuse_set/release_nowrite. Then we're on safe side because
>>> fuse_flush_writepages() would call fuse_send_writepage() anyway.
>>> 2nd case: someone called fuse_set_nowrite and it is waiting now for
>>> completion of all in-flight requests. Here what you wrote about "happening
>>> rarely and no more than once" is applicable.
>>> 3rd case: someone (e.g. fuse_do_setattr()) is in the middle of
>>> fuse_set_nowrite..fuse_release_nowrite section. The fact that
>>> fuse_set_nowrite returned implies that all in-flight requests were completed
>>> along with all its secondary requests (because we increment writectr for a
>>> secondry before decrementing it for the primary -- that's how
>>> fuse_writepage_end is implemeted). Further requests are blocked by negative
>>> writectr. Hence there cannot be any in-flight requests and no invocations of
>>> fuse_writepage_end while we're in fuse_set_nowrite..fuse_release_nowrite
>>> section.
>>>
>>> It looks obvious now, but I'm not sure we'll able to recollect it later.
>> Added your analysis as a comment and all patches pushed to writepages.v2.
> Great! So I can proceed with re-basing the rest of
> writeback-cache-policy pile to writepages.v2 soon.
More testing (with writeback-cache-policy enabled) revealed another bug
in that implementation. The problem deals with a write(2) extending i_size:
1. There is an in-flight primary request now. It was properly cropped
against i_size which was valid then and is valid now. So there is a page
in the request that will be written to the server partially.
2. write(2) to a distant offset makes a hole and extends i_size.
3. write(2) populates that whole page by new user data.
4. Writeback happens and fuse_writepage_in_flight() attaches a secondary
request to the primary request.
5. fuse_writepage_end() for the primary request calls
fuse_send_writepage() with 'crop' arg equal to "inarg->offset +
inarg->size". But inarg->size was calculated before i_size extension, so
the second request will be cropped as well as primary. The result is
that the tail of secondary request populated by valid actual user data
won't be stored on the server.
The problem will be hidden by adding fuse_wait_on_page_writeback() to
write_begin fuse method, but the implementation will remain unsafe if we
believe a re-dirty may happen spontaneously. Straightforward solution
would be to crop secondary requests at the time of their queuing (using
actual i_size). Then fuse_send_writepage() would crop further only if
i_size shrunk. Please let me know if you come up with a smarter idea.
Thanks,
Maxim
On 10/09/2013 12:20 PM, Maxim Patlasov wrote:
> Hi Miklos,
>
> On 10/03/2013 08:22 PM, Maxim Patlasov wrote:
>> On 10/03/2013 08:09 PM, Miklos Szeredi wrote:
>>> On Thu, Oct 3, 2013 at 5:50 PM, Maxim Patlasov <[email protected]> wrote:
>>>> On 10/03/2013 07:14 PM, Miklos Szeredi wrote:
>>>>> On Thu, Oct 03, 2013 at 05:28:30PM +0400, Maxim Patlasov wrote:
>>>>>
>>>>>> 1. There is an in-flight primary request with a chain of secondary ones.
>>>>>> 2. User calls ftruncate(2) to extend file; fuse_set_nowrite() makes
>>>>>> fi->writectr negative and starts waiting for completion of that
>>>>>> in-flight request
>>>>>> 3. Userspace fuse daemon ACKs the request and fuse_writepage_end()
>>>>>> is called; it calls __fuse_flush_writepages(), but the latter does
>>>>>> nothing because fi->writectr < 0
>>>>>> 4. fuse_do_setattr() proceeds extending i_size and calling
>>>>>> __fuse_release_nowrite(). But now new (increased) i_size will be
>>>>>> used as 'crop' arg of __fuse_flush_writepages()
>>>>>>
>>>>>> stale data can leak to the server.
>>>>> So, lets do this then: skip fuse_flush_writepages() and call
>>>>> fuse_send_writepage() directly. It will ignore the NOWRITE logic, but
>>>>> that's
>>>>> okay, this happens rarely and cannot happen more than once in a row.
>>>>>
>>>>> Does this look good?
>>>> Yes, but let's at least add a comment explaining why it's safe. There are
>>>> three different cases and what you write above explains only one of them:
>>>>
>>>> 1st case (trivial): there are no concurrent activities using
>>>> fuse_set/release_nowrite. Then we're on safe side because
>>>> fuse_flush_writepages() would call fuse_send_writepage() anyway.
>>>> 2nd case: someone called fuse_set_nowrite and it is waiting now for
>>>> completion of all in-flight requests. Here what you wrote about "happening
>>>> rarely and no more than once" is applicable.
>>>> 3rd case: someone (e.g. fuse_do_setattr()) is in the middle of
>>>> fuse_set_nowrite..fuse_release_nowrite section. The fact that
>>>> fuse_set_nowrite returned implies that all in-flight requests were completed
>>>> along with all its secondary requests (because we increment writectr for a
>>>> secondry before decrementing it for the primary -- that's how
>>>> fuse_writepage_end is implemeted). Further requests are blocked by negative
>>>> writectr. Hence there cannot be any in-flight requests and no invocations of
>>>> fuse_writepage_end while we're in fuse_set_nowrite..fuse_release_nowrite
>>>> section.
>>>>
>>>> It looks obvious now, but I'm not sure we'll able to recollect it later.
>>> Added your analysis as a comment and all patches pushed to writepages.v2.
>> Great! So I can proceed with re-basing the rest of
>> writeback-cache-policy pile to writepages.v2 soon.
> More testing (with writeback-cache-policy enabled) revealed another bug
> in that implementation. The problem deals with a write(2) extending i_size:
>
> 1. There is an in-flight primary request now. It was properly cropped
> against i_size which was valid then and is valid now. So there is a page
> in the request that will be written to the server partially.
> 2. write(2) to a distant offset makes a hole and extends i_size.
> 3. write(2) populates that whole page by new user data.
> 4. Writeback happens and fuse_writepage_in_flight() attaches a secondary
> request to the primary request.
> 5. fuse_writepage_end() for the primary request calls
> fuse_send_writepage() with 'crop' arg equal to "inarg->offset +
> inarg->size". But inarg->size was calculated before i_size extension, so
> the second request will be cropped as well as primary. The result is
> that the tail of secondary request populated by valid actual user data
> won't be stored on the server.
>
> The problem will be hidden by adding fuse_wait_on_page_writeback() to
> write_begin fuse method, but the implementation will remain unsafe if we
> believe a re-dirty may happen spontaneously. Straightforward solution
> would be to crop secondary requests at the time of their queuing (using
> actual i_size). Then fuse_send_writepage() would crop further only if
> i_size shrunk. Please let me know if you come up with a smarter idea.
Sorry for flooding. I've just realized that the problem is actually
solved (not "hidden") by adding fuse_wait_on_page_writeback() to
write_begin fuse method. No need to rework cropping mechanism again.
Thanks,
Maxim