2003-01-29 06:00:02

by Oliver Xymoron

[permalink] [raw]
Subject: [PATCH 2.5] Report write errors to applications

Andrew, this is a forward-port of the 2.4 write error propagation
patch I just posted, applies against -mm6. Lightly tested on 2.5, but
should be straightforward.

diff -urN -x '.patch*' -x '*.orig' orig/fs/buffer.c patched/fs/buffer.c
--- orig/fs/buffer.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/fs/buffer.c 2003-01-24 13:25:08.000000000 -0600
@@ -165,15 +165,27 @@
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
* unlock the buffer. This is what ll_rw_block uses too.
*/
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- /*
- * This happens, due to failed READA attempts.
- * buffer_io_error(bh);
- */
+ /* This happens, due to failed READA attempts. */
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ buffer_io_error(bh);
+ printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+ bdevname(bh->b_bdev));
+ bh->b_page->mapping->error = -EIO;
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -550,6 +562,9 @@
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh);
+ printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+ bdevname(bh->b_bdev));
+ bh->b_page->mapping->error = -EIO;
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -1201,7 +1216,7 @@
if (buffer_dirty(bh))
buffer_error();
get_bh(bh);
- bh->b_end_io = end_buffer_io_sync;
+ bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -2604,13 +2619,14 @@
continue;

get_bh(bh);
- bh->b_end_io = end_buffer_io_sync;
if (rw == WRITE) {
+ bh->b_end_io = end_buffer_write_sync;
if (test_clear_buffer_dirty(bh)) {
submit_bh(WRITE, bh);
continue;
}
} else {
+ bh->b_end_io = end_buffer_read_sync;
if (!buffer_uptodate(bh)) {
submit_bh(rw, bh);
continue;
diff -urN -x '.patch*' -x '*.orig' orig/fs/inode.c patched/fs/inode.c
--- orig/fs/inode.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/fs/inode.c 2003-01-24 13:24:26.000000000 -0600
@@ -134,6 +134,7 @@
mapping->dirtied_when = 0;
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
+ mapping->error = 0;
if (sb->s_bdev)
inode->i_data.backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
diff -urN -x '.patch*' -x '*.orig' orig/fs/ntfs/compress.c patched/fs/ntfs/compress.c
--- orig/fs/ntfs/compress.c 2003-01-13 23:58:43.000000000 -0600
+++ patched/fs/ntfs/compress.c 2003-01-24 13:24:26.000000000 -0600
@@ -598,7 +598,7 @@
continue;
}
atomic_inc(&tbh->b_count);
- tbh->b_end_io = end_buffer_io_sync;
+ tbh->b_end_io = end_buffer_read_sync;
submit_bh(READ, tbh);
}

diff -urN -x '.patch*' -x '*.orig' orig/fs/open.c patched/fs/open.c
--- orig/fs/open.c 2003-01-13 23:58:05.000000000 -0600
+++ patched/fs/open.c 2003-01-24 13:24:26.000000000 -0600
@@ -843,21 +843,41 @@
*/
int filp_close(struct file *filp, fl_owner_t id)
{
- int retval;
+ struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+ struct inode *inode = mapping->host;
+ int retval = 0, err;
+
+ /* Report and clear outstanding errors */
+ err = filp->f_error;
+ if (err) {
+ filp->f_error = 0;
+ retval = err;
+ }
+
+ down(&inode->i_sem);
+ err = mapping->error;
+ if (err && !retval) {
+ mapping->error = 0;
+ retval = err;
+ }
+ up(&inode->i_sem);

if (!file_count(filp)) {
printk(KERN_ERR "VFS: Close: file count is 0\n");
- return 0;
+ return retval;
}
- retval = 0;
+
if (filp->f_op && filp->f_op->flush) {
lock_kernel();
- retval = filp->f_op->flush(filp);
+ err = filp->f_op->flush(filp);
unlock_kernel();
+ if (err && !retval)
+ retval = err;
}
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
fput(filp);
+
return retval;
}

diff -urN -x '.patch*' -x '*.orig' orig/include/linux/buffer_head.h patched/include/linux/buffer_head.h
--- orig/include/linux/buffer_head.h 2003-01-24 13:24:21.000000000 -0600
+++ patched/include/linux/buffer_head.h 2003-01-24 13:24:26.000000000 -0600
@@ -136,7 +136,8 @@
int try_to_free_buffers(struct page *);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void buffer_insert_list(spinlock_t *lock,
diff -urN -x '.patch*' -x '*.orig' orig/include/linux/fs.h patched/include/linux/fs.h
--- orig/include/linux/fs.h 2003-01-24 13:24:21.000000000 -0600
+++ patched/include/linux/fs.h 2003-01-24 13:24:26.000000000 -0600
@@ -326,6 +326,7 @@
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+ int error; /* write error for fsync */
};

struct char_device {
diff -urN -x '.patch*' -x '*.orig' orig/kernel/ksyms.c patched/kernel/ksyms.c
--- orig/kernel/ksyms.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/kernel/ksyms.c 2003-01-24 13:24:26.000000000 -0600
@@ -176,7 +176,8 @@
EXPORT_SYMBOL(d_lookup);
EXPORT_SYMBOL(d_path);
EXPORT_SYMBOL(mark_buffer_dirty);
-EXPORT_SYMBOL(end_buffer_io_sync);
+EXPORT_SYMBOL(end_buffer_read_sync);
+EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
diff -urN -x '.patch*' -x '*.orig' orig/mm/filemap.c patched/mm/filemap.c
--- orig/mm/filemap.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/mm/filemap.c 2003-01-24 13:24:26.000000000 -0600
@@ -185,6 +185,14 @@
write_lock(&mapping->page_lock);
}
write_unlock(&mapping->page_lock);
+
+ /* Check for outstanding write errors */
+ if (mapping->error)
+ {
+ ret = mapping->error;
+ mapping->error = 0;
+ }
+
return ret;
}

diff -urN -x '.patch*' -x '*.orig' orig/mm/vmscan.c patched/mm/vmscan.c
--- orig/mm/vmscan.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/mm/vmscan.c 2003-01-24 13:24:26.000000000 -0600
@@ -342,6 +342,9 @@
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);

+ if (res < 0) {
+ mapping->error = res;
+ }
if (res == WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
goto activate_locked;


--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."


2003-01-29 07:19:58

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

Oliver Xymoron <[email protected]> wrote:
>
> Andrew, this is a forward-port of the 2.4 write error propagation
> patch I just posted, applies against -mm6. Lightly tested on 2.5, but
> should be straightforward.

You sent Marcelo the 2.5 patch.

> diff -urN -x '.patch*' -x '*.orig' orig/fs/buffer.c patched/fs/buffer.c
> --- orig/fs/buffer.c 2003-01-24 13:24:21.000000000 -0600
> +++ patched/fs/buffer.c 2003-01-24 13:25:08.000000000 -0600
> @@ -165,15 +165,27 @@
> * Default synchronous end-of-IO handler.. Just mark it up-to-date and
> * unlock the buffer. This is what ll_rw_block uses too.
> */
> -void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> +void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
> {
> if (uptodate) {
> set_buffer_uptodate(bh);
> } else {
> - /*
> - * This happens, due to failed READA attempts.
> - * buffer_io_error(bh);
> - */
> + /* This happens, due to failed READA attempts. */
> + clear_buffer_uptodate(bh);
> + }
> + unlock_buffer(bh);
> + put_bh(bh);
> +}
> +
> +void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
> +{
> + if (uptodate) {
> + set_buffer_uptodate(bh);
> + } else {
> + buffer_io_error(bh);
> + printk(KERN_WARNING "lost page write due to I/O error on %s\n",
> + bdevname(bh->b_bdev));
> + bh->b_page->mapping->error = -EIO;

Accessing bh->b_page here could be risky.

In 2.5, this _has_ to be safe, because submit_bh() requires it.

But in 2.4 (and in thus-far-untested corners of 2.5) it is possible that
someone is feeding a buffer_head with a garbage ->b_page into ll_rw_blk().

Now it could be that we're safe in this respect (the comment above
generic_make_request says that the caller must set up b_page, but perhaps lots
of drivers do not look at it. Not sure...)

So what you could do here is to deliberately dereference bh->b_page even in
the non-error case so those problems are quickly detected.



But probably, there's not much point in doing any of this for ll_rw_blk() and
end_buffer_io_sync(). Most of those buffers are against the blockdev
mapping, and once the errors are propagated into the blockdev mapping's
address space we no longer know what file they pertain to.

It would be better to check the uptodate state of the buffer_head in
drop_buffers(), and to propagate errors into the address_space there. For
buffers which are still attached to the S_ISREG file's address_space we're OK
- fsync_buffers_list() will handle them and will return errors to the fsync()
caller. We only need to handle those buffers which were stripped
asynchronously by VM activity.


> clear_buffer_uptodate(bh);
> }
> unlock_buffer(bh);
> @@ -550,6 +562,9 @@
> set_buffer_uptodate(bh);
> } else {
> buffer_io_error(bh);
> + printk(KERN_WARNING "lost page write due to I/O error on %s\n",
> + bdevname(bh->b_bdev));
> + bh->b_page->mapping->error = -EIO;

page->mapping->error = -EIO;

would suffice here.

> clear_buffer_uptodate(bh);
> SetPageError(page);
> }
> @@ -1201,7 +1216,7 @@
> if (buffer_dirty(bh))
> buffer_error();
> get_bh(bh);
> - bh->b_end_io = end_buffer_io_sync;
> + bh->b_end_io = end_buffer_read_sync;
> submit_bh(READ, bh);
> wait_on_buffer(bh);
> if (buffer_uptodate(bh))
> @@ -2604,13 +2619,14 @@
> continue;
>
> get_bh(bh);
> - bh->b_end_io = end_buffer_io_sync;
> if (rw == WRITE) {
> + bh->b_end_io = end_buffer_write_sync;
> if (test_clear_buffer_dirty(bh)) {
> submit_bh(WRITE, bh);
> continue;
> }
> } else {
> + bh->b_end_io = end_buffer_read_sync;
> if (!buffer_uptodate(bh)) {
> submit_bh(rw, bh);
> continue;
> diff -urN -x '.patch*' -x '*.orig' orig/fs/inode.c patched/fs/inode.c
> --- orig/fs/inode.c 2003-01-24 13:24:21.000000000 -0600
> +++ patched/fs/inode.c 2003-01-24 13:24:26.000000000 -0600
> @@ -134,6 +134,7 @@
> mapping->dirtied_when = 0;
> mapping->assoc_mapping = NULL;
> mapping->backing_dev_info = &default_backing_dev_info;
> + mapping->error = 0;

Machines can have millions of address_spaces in-core and here you have used
32 bits where two would do.

Please make mapping->error an unsigned long and just use bits zero and one
here for ENOSPC and EIO. Make sure that atomic ops are used, and this way we
get 30 spare bits in the address_space for future expansion.

> @@ -598,7 +598,7 @@
> continue;
> }
> atomic_inc(&tbh->b_count);
> - tbh->b_end_io = end_buffer_io_sync;
> + tbh->b_end_io = end_buffer_read_sync;
> submit_bh(READ, tbh);
> }
>
> diff -urN -x '.patch*' -x '*.orig' orig/fs/open.c patched/fs/open.c
> --- orig/fs/open.c 2003-01-13 23:58:05.000000000 -0600
> +++ patched/fs/open.c 2003-01-24 13:24:26.000000000 -0600
> @@ -843,21 +843,41 @@
> */
> int filp_close(struct file *filp, fl_owner_t id)
> {
> - int retval;
> + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
> + struct inode *inode = mapping->host;
> + int retval = 0, err;
> +
> + /* Report and clear outstanding errors */
> + err = filp->f_error;
> + if (err) {
> + filp->f_error = 0;
> + retval = err;
> + }
> +
> + down(&inode->i_sem);
> + err = mapping->error;
> + if (err && !retval) {
> + mapping->error = 0;
> + retval = err;
> + }

I don't really understand why you're reporting the error on close().
Applications probably do not expect that.

It would be better to report the error to the next call to fsync(),
fdatasync() and msync(). Reporting it on close as well doesn't hurt, but
*sync() should be the main reporting interface.


> + up(&inode->i_sem);

with test_and_clear_bit() against mapping->errors we can do away with the
i_sem locking here.

--- orig/mm/vmscan.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/mm/vmscan.c 2003-01-24 13:24:26.000000000 -0600
@@ -342,6 +342,9 @@
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);

+ if (res < 0) {
+ mapping->error = res;
+ }

Most writepage() activity happens in fs/mpage.c - you'll need to propagate IO
errors into the address_space in the BIO completion handlers there.

2003-01-29 16:15:04

by Oliver Xymoron

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

On Tue, Jan 28, 2003 at 11:29:29PM -0800, Andrew Morton wrote:
> Oliver Xymoron <[email protected]> wrote:
> >
> > Andrew, this is a forward-port of the 2.4 write error propagation
> > patch I just posted, applies against -mm6. Lightly tested on 2.5, but
> > should be straightforward.
>
> You sent Marcelo the 2.5 patch.

Doh.

> > diff -urN -x '.patch*' -x '*.orig' orig/fs/buffer.c patched/fs/buffer.c
> > +++ patched/fs/buffer.c 2003-01-24 13:25:08.000000000 -0600
> > @@ -165,15 +165,27 @@
> > * Default synchronous end-of-IO handler.. Just mark it up-to-date and
> > * unlock the buffer. This is what ll_rw_block uses too.
> > */
> > -void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> > +void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
> > {
> > if (uptodate) {
> > set_buffer_uptodate(bh);
> > } else {
> > - /*
> > - * This happens, due to failed READA attempts.
> > - * buffer_io_error(bh);
> > - */
> > + /* This happens, due to failed READA attempts. */
> > + clear_buffer_uptodate(bh);
> > + }
> > + unlock_buffer(bh);
> > + put_bh(bh);
> > +}
> > +
> > +void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
> > +{
> > + if (uptodate) {
> > + set_buffer_uptodate(bh);
> > + } else {
> > + buffer_io_error(bh);
> > + printk(KERN_WARNING "lost page write due to I/O error on %s\n",
> > + bdevname(bh->b_bdev));
> > + bh->b_page->mapping->error = -EIO;
>
> Accessing bh->b_page here could be risky.
>
> In 2.5, this _has_ to be safe, because submit_bh() requires it.
>
> But in 2.4 (and in thus-far-untested corners of 2.5) it is possible that
> someone is feeding a buffer_head with a garbage ->b_page into ll_rw_blk().

I actually looked at this a bit and it seemed fine. What are some
likely candidates? I'll take a look.

> Now it could be that we're safe in this respect (the comment above
> generic_make_request says that the caller must set up b_page, but
> perhaps lots of drivers do not look at it. Not sure...)
>
> So what you could do here is to deliberately dereference bh->b_page even in
> the non-error case so those problems are quickly detected.

Enforcing it in 2.5 this way might make sense, but if it's actually a
problem in 2.4, I doubt that's acceptable at this point. Some form of this fix
needs to make it to 2.4, which is where the people with big disk farms
are going to be for at least the next year.

> But probably, there's not much point in doing any of this for ll_rw_blk() and
> end_buffer_io_sync(). Most of those buffers are against the blockdev
> mapping, and once the errors are propagated into the blockdev mapping's
> address space we no longer know what file they pertain to.

Ok, didn't realize how much this had changed in 2.5. This works nicely
for regular files in 2.4.

> It would be better to check the uptodate state of the buffer_head in
> drop_buffers(), and to propagate errors into the address_space there. For
> buffers which are still attached to the S_ISREG file's address_space we're OK
> - fsync_buffers_list() will handle them and will return errors to the fsync()
> caller. We only need to handle those buffers which were stripped
> asynchronously by VM activity.

Are we guaranteed that we'll get a try_to_free_buffers after IO
completion and before sync? I haven't dug through this path much.

>
> > clear_buffer_uptodate(bh);
> > }
> > unlock_buffer(bh);
> > @@ -550,6 +562,9 @@
> > set_buffer_uptodate(bh);
> > } else {
> > buffer_io_error(bh);
> > + printk(KERN_WARNING "lost page write due to I/O error on %s\n",
> > + bdevname(bh->b_bdev));
> > + bh->b_page->mapping->error = -EIO;
>
> page->mapping->error = -EIO;
>
> would suffice here.

Will look at that.

> > clear_buffer_uptodate(bh);
> > SetPageError(page);
> > }
> > @@ -1201,7 +1216,7 @@
> > if (buffer_dirty(bh))
> > buffer_error();
> > get_bh(bh);
> > - bh->b_end_io = end_buffer_io_sync;
> > + bh->b_end_io = end_buffer_read_sync;
> > submit_bh(READ, bh);
> > wait_on_buffer(bh);
> > if (buffer_uptodate(bh))
> > @@ -2604,13 +2619,14 @@
> > continue;
> >
> > get_bh(bh);
> > - bh->b_end_io = end_buffer_io_sync;
> > if (rw == WRITE) {
> > + bh->b_end_io = end_buffer_write_sync;
> > if (test_clear_buffer_dirty(bh)) {
> > submit_bh(WRITE, bh);
> > continue;
> > }
> > } else {
> > + bh->b_end_io = end_buffer_read_sync;
> > if (!buffer_uptodate(bh)) {
> > submit_bh(rw, bh);
> > continue;
> > diff -urN -x '.patch*' -x '*.orig' orig/fs/inode.c patched/fs/inode.c
> > +++ patched/fs/inode.c 2003-01-24 13:24:26.000000000 -0600
> > @@ -134,6 +134,7 @@
> > mapping->dirtied_when = 0;
> > mapping->assoc_mapping = NULL;
> > mapping->backing_dev_info = &default_backing_dev_info;
> > + mapping->error = 0;
>
> Machines can have millions of address_spaces in-core and here you have used
> 32 bits where two would do.
>
> Please make mapping->error an unsigned long and just use bits zero and one
> here for ENOSPC and EIO. Make sure that atomic ops are used, and this way we
> get 30 spare bits in the address_space for future expansion.

I certainly thought about that (given your partial patch that did the
same with pages), but I couldn't see the point. We can return exactly
one error and given that they're all effectively "everything you did
since the last sync is hosed", there's no obvious way to choose among
them. There is no ENOSPC_AND_BTW_IO. As long as we're dedicating a new
structure member to this, why not simplify the logic and just track
the latest error?

If we're going to bother with this complexity, then we ought to merge
it with an existing element, for example, combine it with gfp_mask. If
you think that's worth the effort, I'll take a look at it. I don't
think this complexity makes sense for 2.4 though.

> > @@ -598,7 +598,7 @@
> > continue;
> > }
> > atomic_inc(&tbh->b_count);
> > - tbh->b_end_io = end_buffer_io_sync;
> > + tbh->b_end_io = end_buffer_read_sync;
> > submit_bh(READ, tbh);
> > }
> >
> > diff -urN -x '.patch*' -x '*.orig' orig/fs/open.c patched/fs/open.c
> > +++ patched/fs/open.c 2003-01-24 13:24:26.000000000 -0600
> > @@ -843,21 +843,41 @@
> > */
> > int filp_close(struct file *filp, fl_owner_t id)
> > {
> > - int retval;
> > + struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
> > + struct inode *inode = mapping->host;
> > + int retval = 0, err;
> > +
> > + /* Report and clear outstanding errors */
> > + err = filp->f_error;
> > + if (err) {
> > + filp->f_error = 0;
> > + retval = err;
> > + }
> > +
> > + down(&inode->i_sem);
> > + err = mapping->error;
> > + if (err && !retval) {
> > + mapping->error = 0;
> > + retval = err;
> > + }
>
> I don't really understand why you're reporting the error on close().
> Applications probably do not expect that.
>
> It would be better to report the error to the next call to fsync(),
> fdatasync() and msync(). Reporting it on close as well doesn't hurt, but
> *sync() should be the main reporting interface.

Yes, I do those too, there's a chunk that you don't appear to have
quoted from filemap which is in the common waiting path of all the sync
syscalls:

diff -urN -x '.patch*' -x '*.orig' orig/mm/filemap.c patched/mm/filemap.c
--- orig/mm/filemap.c 2003-01-24 13:24:21.000000000 -0600
+++ patched/mm/filemap.c 2003-01-24 13:24:26.000000000 -0600
@@ -185,6 +185,14 @@
write_lock(&mapping->page_lock);
}
write_unlock(&mapping->page_lock);
+
+ /* Check for outstanding write errors */
+ if (mapping->error)
+ {
+ ret = mapping->error;
+ mapping->error = 0;
+ }
+
return ret;
}

Though it turns out a lot of apps _are_ checking close and not doing
sync (I believe this includes some of the standard fileutils).

>
> > + up(&inode->i_sem);
>
> with test_and_clear_bit() against mapping->errors we can do away with the
> i_sem locking here.

I'm actually not really concerned about concurrent writers in the
close case. It inherently races with with the VM pushing pages out
anyway. And we already make no guarantees about _who_ the error gets
delivered to in the multiple writers case, so the case where two
simultaneous closers get errors is ok too. The above locking is just
cut and paste from the code that handles another class of inode error.

> +++ patched/mm/vmscan.c 2003-01-24 13:24:26.000000000 -0600
> @@ -342,6 +342,9 @@
> SetPageReclaim(page);
> res = mapping->a_ops->writepage(page, &wbc);
>
> + if (res < 0) {
> + mapping->error = res;
> + }
>
> Most writepage() activity happens in fs/mpage.c - you'll need to propagate IO
> errors into the address_space in the BIO completion handlers there.

Another 2.5 change I hadn't noticed. Ok, will look at that. I haven't
come up with a good test for the writepage ENOSPC, thoughts?

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2003-01-29 21:15:53

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

Oliver Xymoron <[email protected]> wrote:
>
> > - fsync_buffers_list() will handle them and will return errors to the fsync()
> > caller. We only need to handle those buffers which were stripped
> > asynchronously by VM activity.
>
> Are we guaranteed that we'll get a try_to_free_buffers after IO
> completion and before sync? I haven't dug through this path much.

Think so. That's the only place where buffers are detached. Otherwise,
fsync_buffers_list() looks at them all.

There's also the prune_icache() buffer-stripper remove_inode_buffers().
Nobody knows about the inode by that time, but there's a chance that the
inode will be rescued before it is thrown away, so remove_inode_buffers()
should propagate errors into the address_space as well.

> Another 2.5 change I hadn't noticed. Ok, will look at that. I haven't
> come up with a good test for the writepage ENOSPC, thoughts?

See kswapd-writepage.c from
http://www.zip.com.au/~akpm/linux/patches/2.5/ext3-tools.tar.gz

If you run that over a filesystem which has only a few K of space, apply
memory pressure while it is sleeping then data loss will ensue.

hm, there's also enospc-writepage.c which is designed to exactly demonstrate
this problem.


2003-01-30 21:02:55

by Oliver Xymoron

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

On Wed, Jan 29, 2003 at 01:42:05PM -0800, Andrew Morton wrote:
> Oliver Xymoron <[email protected]> wrote:
> >
> > > - fsync_buffers_list() will handle them and will return errors to the fsync()
> > > caller. We only need to handle those buffers which were stripped
> > > asynchronously by VM activity.
> >
> > Are we guaranteed that we'll get a try_to_free_buffers after IO
> > completion and before sync? I haven't dug through this path much.
>
> Think so. That's the only place where buffers are detached. Otherwise,
> fsync_buffers_list() looks at them all.

The other problem here is that by the time we're in
try_to_free_buffers we no longer know that we're looking at a harmless
stale page (readahead?) or a write error, which is why Linus had me
make the separate end_buffer functions. So I don't think this pans out
- thoughts?

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2003-01-30 21:31:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

Oliver Xymoron wrote:
>
> On Wed, Jan 29, 2003 at 01:42:05PM -0800, Andrew Morton wrote:
> > Oliver Xymoron <[email protected]> wrote:
> > >
> > > > - fsync_buffers_list() will handle them and will return errors to the fsync()
> > > > caller. We only need to handle those buffers which were stripped
> > > > asynchronously by VM activity.
> > >
> > > Are we guaranteed that we'll get a try_to_free_buffers after IO
> > > completion and before sync? I haven't dug through this path much.
> >
> > Think so. That's the only place where buffers are detached. Otherwise,
> > fsync_buffers_list() looks at them all.
>
> The other problem here is that by the time we're in
> try_to_free_buffers we no longer know that we're looking at a harmless
> stale page (readahead?) or a write error, which is why Linus had me
> make the separate end_buffer functions. So I don't think this pans out
> - thoughts?

If the buffer has buffer_req and !buffer_uptodate and !buffer_locked
we know that it was submitted for IO, that the IO has completed, and
that it failed.

2003-01-30 21:50:53

by Oliver Xymoron

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

On Thu, Jan 30, 2003 at 01:39:31PM -0800, Andrew Morton wrote:
> Oliver Xymoron wrote:
> >
> > On Wed, Jan 29, 2003 at 01:42:05PM -0800, Andrew Morton wrote:
> > > Oliver Xymoron <[email protected]> wrote:
> > > >
> > > > > - fsync_buffers_list() will handle them and will return errors to the fsync()
> > > > > caller. We only need to handle those buffers which were stripped
> > > > > asynchronously by VM activity.
> > > >
> > > > Are we guaranteed that we'll get a try_to_free_buffers after IO
> > > > completion and before sync? I haven't dug through this path much.
> > >
> > > Think so. That's the only place where buffers are detached. Otherwise,
> > > fsync_buffers_list() looks at them all.
> >
> > The other problem here is that by the time we're in
> > try_to_free_buffers we no longer know that we're looking at a harmless
> > stale page (readahead?) or a write error, which is why Linus had me
> > make the separate end_buffer functions. So I don't think this pans out
> > - thoughts?
>
> If the buffer has buffer_req and !buffer_uptodate and !buffer_locked
> we know that it was submitted for IO, that the IO has completed, and
> that it failed.

2.5 says this:

void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/*
* This happens, due to failed READA attempts.
* buffer_io_error(bh);
*/
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
put_bh(bh);
}

The comment suggests that we need to distinguish read errors from
write errors and I tend to agree. Bear in mind that we're limited to
doing this per inode, so if we start flagging errors for reads, we can
really confuse writers. Perhaps not fatal, but suboptimal certainly.

On the other hand, I haven't been able to find anywhere in 2.4 that's
setting b_end_io to end_io_write_sync that's not also setting b_page,
so I think my original patch is safe in this regard. I suspect 2.5 is
similar.

--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."

2003-01-30 22:04:39

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

Oliver Xymoron wrote:
>
> The comment suggests that we need to distinguish read errors from
> write errors and I tend to agree.

OK, well you could call set_buffer_write_io_error/set_buffer_read_io_error()
in the end_io handlers, and pick that up later on.

To avoid adding a couple of new clear_bits in submit_bh,
you could do:

if (test_set_buffer_req(bh)) {
clear_buffer_write_io_error(bh);
clear_buffer_read_io_error(bh);
}

2003-01-31 18:33:31

by Oliver Xymoron

[permalink] [raw]
Subject: Re: [PATCH 2.5] Report write errors to applications

On Thu, Jan 30, 2003 at 02:13:54PM -0800, Andrew Morton wrote:
> Oliver Xymoron wrote:
> >
> > The comment suggests that we need to distinguish read errors from
> > write errors and I tend to agree.
>
> OK, well you could call set_buffer_write_io_error/set_buffer_read_io_error()
> in the end_io handlers, and pick that up later on.
>
> To avoid adding a couple of new clear_bits in submit_bh,
> you could do:
>
> if (test_set_buffer_req(bh)) {
> clear_buffer_write_io_error(bh);
> clear_buffer_read_io_error(bh);
> }

Read errors aren't really a problem as they tend to be caught
synchronously. And we also have to be careful not to clear out a write
error by doing a subsequent read on the now !uptodate page.

How's this look? Against -mm7, compiles, untested:

diff -urN -x '.patch*' -x '*.orig' orig/fs/buffer.c patched/fs/buffer.c
--- orig/fs/buffer.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/fs/buffer.c 2003-01-31 12:37:13.000000000 -0600
@@ -165,15 +165,27 @@
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
* unlock the buffer. This is what ll_rw_block uses too.
*/
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- /*
- * This happens, due to failed READA attempts.
- * buffer_io_error(bh);
- */
+ /* This happens, due to failed READA attempts. */
+ clear_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ } else {
+ buffer_io_error(bh);
+ printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+ bdevname(bh->b_bdev));
+ set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -550,6 +562,9 @@
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh);
+ printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+ bdevname(bh->b_bdev));
+ page->mapping->error = -EIO;
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -1201,7 +1216,7 @@
if (buffer_dirty(bh))
buffer_error();
get_bh(bh);
- bh->b_end_io = end_buffer_io_sync;
+ bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -2543,8 +2558,10 @@
buffer_error();
if (rw == READ && buffer_dirty(bh))
buffer_error();
-
- set_buffer_req(bh);
+
+ /* Only clear out a write error when rewriting */
+ if (test_set_buffer_req(bh) && rw == WRITE)
+ clear_buffer_write_io_error(bh);

/*
* from here on down, it's all bio -- do the initial mapping,
@@ -2604,13 +2621,14 @@
continue;

get_bh(bh);
- bh->b_end_io = end_buffer_io_sync;
if (rw == WRITE) {
+ bh->b_end_io = end_buffer_write_sync;
if (test_clear_buffer_dirty(bh)) {
submit_bh(WRITE, bh);
continue;
}
} else {
+ bh->b_end_io = end_buffer_read_sync;
if (!buffer_uptodate(bh)) {
submit_bh(rw, bh);
continue;
@@ -2672,6 +2690,8 @@
bh = head;
do {
check_ttfb_buffer(page, bh);
+ if (buffer_write_io_error(bh))
+ page->mapping->error = -EIO;
if (buffer_busy(bh))
goto failed;
if (!buffer_uptodate(bh) && !buffer_req(bh))
diff -urN -x '.patch*' -x '*.orig' orig/fs/inode.c patched/fs/inode.c
--- orig/fs/inode.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/fs/inode.c 2003-01-31 12:34:09.000000000 -0600
@@ -134,6 +134,7 @@
mapping->dirtied_when = 0;
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
+ mapping->error = 0;
if (sb->s_bdev)
inode->i_data.backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
diff -urN -x '.patch*' -x '*.orig' orig/fs/mpage.c patched/fs/mpage.c
--- orig/fs/mpage.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/fs/mpage.c 2003-01-31 12:34:09.000000000 -0600
@@ -562,6 +562,8 @@
if (bio)
bio = mpage_bio_submit(WRITE, bio);
*ret = page->mapping->a_ops->writepage(page, wbc);
+ if (*ret < 0)
+ page->mapping->error = *ret;
out:
return bio;
}
@@ -665,6 +667,8 @@
test_clear_page_dirty(page)) {
if (writepage) {
ret = (*writepage)(page, wbc);
+ if (ret < 0)
+ page->mapping->error = ret;
} else {
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret, wbc);
diff -urN -x '.patch*' -x '*.orig' orig/fs/ntfs/compress.c patched/fs/ntfs/compress.c
--- orig/fs/ntfs/compress.c 2003-01-13 23:58:43.000000000 -0600
+++ patched/fs/ntfs/compress.c 2003-01-31 12:34:09.000000000 -0600
@@ -598,7 +598,7 @@
continue;
}
atomic_inc(&tbh->b_count);
- tbh->b_end_io = end_buffer_io_sync;
+ tbh->b_end_io = end_buffer_read_sync;
submit_bh(READ, tbh);
}

diff -urN -x '.patch*' -x '*.orig' orig/fs/open.c patched/fs/open.c
--- orig/fs/open.c 2003-01-13 23:58:05.000000000 -0600
+++ patched/fs/open.c 2003-01-31 12:34:09.000000000 -0600
@@ -843,21 +843,38 @@
*/
int filp_close(struct file *filp, fl_owner_t id)
{
- int retval;
+ struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+ int retval = 0, err;
+
+ /* Report and clear outstanding errors */
+ err = filp->f_error;
+ if (err) {
+ filp->f_error = 0;
+ retval = err;
+ }
+
+ err = mapping->error;
+ if (err && !retval) {
+ mapping->error = 0;
+ retval = err;
+ }

if (!file_count(filp)) {
printk(KERN_ERR "VFS: Close: file count is 0\n");
- return 0;
+ return retval;
}
- retval = 0;
+
if (filp->f_op && filp->f_op->flush) {
lock_kernel();
- retval = filp->f_op->flush(filp);
+ err = filp->f_op->flush(filp);
unlock_kernel();
+ if (err && !retval)
+ retval = err;
}
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
fput(filp);
+
return retval;
}

diff -urN -x '.patch*' -x '*.orig' orig/include/linux/buffer_head.h patched/include/linux/buffer_head.h
--- orig/include/linux/buffer_head.h 2003-01-31 12:34:05.000000000 -0600
+++ patched/include/linux/buffer_head.h 2003-01-31 12:36:56.000000000 -0600
@@ -24,8 +24,9 @@
BH_Async_Read, /* Is under end_buffer_async_read I/O */
BH_Async_Write, /* Is under end_buffer_async_write I/O */
BH_Delay, /* Buffer is not yet allocated on disk */
-
BH_Boundary, /* Block is followed by a discontiguity */
+ BH_Write_EIO, /* I/O error on write */
+
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
*/
@@ -103,12 +104,14 @@
BUFFER_FNS(Lock, locked)
TAS_BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
+TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
-BUFFER_FNS(Delay, delay);
+BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
+BUFFER_FNS(Write_EIO,write_io_error)

#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
#define touch_buffer(bh) mark_page_accessed(bh->b_page)
@@ -136,7 +139,8 @@
int try_to_free_buffers(struct page *);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void buffer_insert_list(spinlock_t *lock,
diff -urN -x '.patch*' -x '*.orig' orig/include/linux/fs.h patched/include/linux/fs.h
--- orig/include/linux/fs.h 2003-01-31 12:34:05.000000000 -0600
+++ patched/include/linux/fs.h 2003-01-31 12:34:09.000000000 -0600
@@ -326,6 +326,7 @@
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
+ int error; /* write error for fsync */
};

struct char_device {
diff -urN -x '.patch*' -x '*.orig' orig/kernel/ksyms.c patched/kernel/ksyms.c
--- orig/kernel/ksyms.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/kernel/ksyms.c 2003-01-31 12:34:09.000000000 -0600
@@ -176,7 +176,8 @@
EXPORT_SYMBOL(d_lookup);
EXPORT_SYMBOL(d_path);
EXPORT_SYMBOL(mark_buffer_dirty);
-EXPORT_SYMBOL(end_buffer_io_sync);
+EXPORT_SYMBOL(end_buffer_read_sync);
+EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(__mark_inode_dirty);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
diff -urN -x '.patch*' -x '*.orig' orig/mm/filemap.c patched/mm/filemap.c
--- orig/mm/filemap.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/mm/filemap.c 2003-01-31 12:34:09.000000000 -0600
@@ -185,6 +185,14 @@
write_lock(&mapping->page_lock);
}
write_unlock(&mapping->page_lock);
+
+ /* Check for outstanding write errors */
+ if (mapping->error)
+ {
+ ret = mapping->error;
+ mapping->error = 0;
+ }
+
return ret;
}

diff -urN -x '.patch*' -x '*.orig' orig/mm/vmscan.c patched/mm/vmscan.c
--- orig/mm/vmscan.c 2003-01-31 12:34:05.000000000 -0600
+++ patched/mm/vmscan.c 2003-01-31 12:34:09.000000000 -0600
@@ -342,6 +342,9 @@
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);

+ if (res < 0) {
+ mapping->error = res;
+ }
if (res == WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
goto activate_locked;



--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."