This is the backport of the -ac truncate fixes we discussed
the other day. All code paths have been tested and work OK.
Please review.
--- linux-2.4.18-pre1/fs/buffer.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/buffer.c Sat Jan 5 01:26:38 2002
@@ -1512,6 +1512,7 @@ static int __block_write_full_page(struc
int err, i;
unsigned long block;
struct buffer_head *bh, *head;
+ int need_unlock;
if (!PageLocked(page))
BUG();
@@ -1567,8 +1568,34 @@ static int __block_write_full_page(struc
return 0;
out:
+ /*
+ * ENOSPC, or some other error. We may already have added some
+ * blocks to the file, so we need to write these out to avoid
+ * exposing stale data.
+ */
ClearPageUptodate(page);
- UnlockPage(page);
+ bh = head;
+ need_unlock = 1;
+ /* Recovery: lock and submit the mapped buffers */
+ do {
+ if (buffer_mapped(bh)) {
+ lock_buffer(bh);
+ set_buffer_async_io(bh);
+ need_unlock = 0;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ if (buffer_mapped(bh)) {
+ set_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Dirty, &bh->b_state);
+ submit_bh(WRITE, bh);
+ }
+ bh = next;
+ } while (bh != head);
+ if (need_unlock)
+ UnlockPage(page);
return err;
}
@@ -1639,6 +1666,17 @@ static int __block_prepare_write(struct
}
return 0;
out:
+ bh = head;
+ block_start = 0;
+ do {
+ if (buffer_new(bh) && !buffer_uptodate(bh)) {
+ memset(kaddr+block_start, 0, bh->b_size);
+ set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_dirty(bh);
+ }
+ block_start += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
return err;
}
--- linux-2.4.18-pre1/mm/filemap.c Wed Dec 26 11:47:41 2001
+++ linux-akpm/mm/filemap.c Sat Jan 5 01:26:50 2002
@@ -3004,7 +3004,7 @@ generic_file_write(struct file *file,con
kaddr = kmap(page);
status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
if (status)
- goto unlock;
+ goto sync_failure;
page_fault = __copy_from_user(kaddr+offset, buf, bytes);
flush_dcache_page(page);
status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
@@ -3029,6 +3029,7 @@ unlock:
if (status < 0)
break;
} while (count);
+done:
*ppos = pos;
if (cached_page)
@@ -3050,6 +3051,18 @@ out:
fail_write:
status = -EFAULT;
goto unlock;
+
+sync_failure:
+ /*
+ * If blocksize < pagesize, prepare_write() may have instantiated a
+ * few blocks outside i_size. Trim these off again.
+ */
+ kunmap(page);
+ UnlockPage(page);
+ page_cache_release(page);
+ if (pos + bytes > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ goto done;
o_direct:
written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
On Sat, Jan 05, 2002 at 03:08:25AM -0800, Andrew Morton wrote:
> @@ -1639,6 +1666,17 @@ static int __block_prepare_write(struct
> }
> return 0;
> out:
> + bh = head;
> + block_start = 0;
> + do {
> + if (buffer_new(bh) && !buffer_uptodate(bh)) {
> + memset(kaddr+block_start, 0, bh->b_size);
> + set_bit(BH_Uptodate, &bh->b_state);
> + mark_buffer_dirty(bh);
> + }
> + block_start += bh->b_size;
> + bh = bh->b_this_page;
> + } while (bh != head);
> return err;
> }
>
> --- linux-2.4.18-pre1/mm/filemap.c Wed Dec 26 11:47:41 2001
> +++ linux-akpm/mm/filemap.c Sat Jan 5 01:26:50 2002
> @@ -3004,7 +3004,7 @@ generic_file_write(struct file *file,con
> kaddr = kmap(page);
> status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
> if (status)
> - goto unlock;
> + goto sync_failure;
> page_fault = __copy_from_user(kaddr+offset, buf, bytes);
> flush_dcache_page(page);
> status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
> @@ -3029,6 +3029,7 @@ unlock:
> if (status < 0)
> break;
> } while (count);
> +done:
> *ppos = pos;
>
> if (cached_page)
> @@ -3050,6 +3051,18 @@ out:
> fail_write:
> status = -EFAULT;
> goto unlock;
> +
> +sync_failure:
> + /*
> + * If blocksize < pagesize, prepare_write() may have instantiated a
> + * few blocks outside i_size. Trim these off again.
> + */
> + kunmap(page);
> + UnlockPage(page);
> + page_cache_release(page);
> + if (pos + bytes > inode->i_size)
> + vmtruncate(inode, inode->i_size);
> + goto done;
>
> o_direct:
> written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
is returned by prepare_write. vmtruncate seems way overkill, and after
calling ->truncate the __block_prepare_changes above won't be necessary
because the leftover will be correctly deallocated (no need to clear
them out and to mark them dirty, they will just go away before any
readpage can see them).
for the writepage part of the patch (not quoted in this reply) it seems
fine to me. writepage won't corrupt i_size because it doesn't have the
ability to screwup i_size, and writing at least the successfully mapped
buffers is definitely right thing to do, even if we still can get silent
corruption with holes (but that's not a filesystem kernel side
corruption so I'm satisfied for 2.4 at least :). many thanks for sorting
it out.
Andrea
Andrea Arcangeli wrote:
>
> I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
> is returned by prepare_write. vmtruncate seems way overkill,
No opinion on that here. This is what was in -ac. Perhaps Al can
comment?
> and after
> calling ->truncate the __block_prepare_changes above won't be necessary
> because the leftover will be correctly deallocated (no need to clear
> them out and to mark them dirty, they will just go away before any
> readpage can see them).
No, this code is needed if the write is _inside_ i_size, to an
uninstantiated block. truncate won't remove those blocks, and we've
gone and added them to the file.
-
On Sun, Jan 06, 2002 at 06:53:30PM -0800, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
> > is returned by prepare_write. vmtruncate seems way overkill,
>
> No opinion on that here. This is what was in -ac. Perhaps Al can
> comment?
>
> > and after
> > calling ->truncate the __block_prepare_changes above won't be necessary
> > because the leftover will be correctly deallocated (no need to clear
> > them out and to mark them dirty, they will just go away before any
> > readpage can see them).
>
> No, this code is needed if the write is _inside_ i_size, to an
> uninstantiated block. truncate won't remove those blocks, and we've
> gone and added them to the file.
I see, I got mistaken because here we're not in a i_sem-less writepage, so
in those cases I wanted to always deallocate the blocks rather than
lefting zeroed leftovers. if the leftover blocks are over i_size (common
case I was thinking about incidentally :) that's automatica with
->truncate, but if they aren't over i_size that's not enough and we miss
a lowlevel API to decallocate a range of blocks, so your patch is the
only way indeed.
>
> -
Andrea
Andrea Arcangeli wrote:
>
> I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
> is returned by prepare_write. vmtruncate seems way overkill,
Actually, vmtruncate will trim the page off the inode as well as the
blocks from the file. I don't think there's necessarily a problem
with having a page wholly outside i_size, but..
-
On Sat, Jan 05, 2002 at 03:08:25AM -0800, Andrew Morton wrote:
> }
> return 0;
> out:
> + bh = head;
> + block_start = 0;
> + do {
> + if (buffer_new(bh) && !buffer_uptodate(bh)) {
> + memset(kaddr+block_start, 0, bh->b_size);
> + set_bit(BH_Uptodate, &bh->b_state);
> + mark_buffer_dirty(bh);
> + }
> + block_start += bh->b_size;
> + bh = bh->b_this_page;
> + } while (bh != head);
> return err;
> }
the above code will end marking uptodate (zeroed) buffers relative to
blocks that cannot be read from disk. So a read-retry won't hit the disk
and that's wrong.
I think that will be fixed by additionally also return -EIO in the
wait_on_buffer loop (instead of goto out), so we won't generate zeroed
uptodate cache in case of read failure.
Andrea
Andrea Arcangeli wrote:
>
> On Sat, Jan 05, 2002 at 03:08:25AM -0800, Andrew Morton wrote:
> > }
> > return 0;
> > out:
> > + bh = head;
> > + block_start = 0;
> > + do {
> > + if (buffer_new(bh) && !buffer_uptodate(bh)) {
> > + memset(kaddr+block_start, 0, bh->b_size);
> > + set_bit(BH_Uptodate, &bh->b_state);
> > + mark_buffer_dirty(bh);
> > + }
> > + block_start += bh->b_size;
> > + bh = bh->b_this_page;
> > + } while (bh != head);
> > return err;
> > }
>
> the above code will end marking uptodate (zeroed) buffers relative to
> blocks that cannot be read from disk. So a read-retry won't hit the disk
> and that's wrong.
>
> I think that will be fixed by additionally also return -EIO in the
> wait_on_buffer loop (instead of goto out), so we won't generate zeroed
> uptodate cache in case of read failure.
>
Right. There's also the case where get_block() returns -EIO when,
for example, it fails on reading an indirect block. We end up
writing zeroes into some of the blocks. But I think that behaviour
is correct.
(I think I'll add a buffer_mapped() test to this code as well. It's
a bit redundant because the fs shouldn't go setting BH_New and not
BH_Mapped, but this code is _very_ rarely executed, and I haven't
tested all filesystems...)
@@ -1633,12 +1660,22 @@ static int __block_prepare_write(struct
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
- err = -EIO;
if (!buffer_uptodate(*wait_bh))
- goto out;
+ return -EIO;
}
return 0;
out:
+ bh = head;
+ block_start = 0;
+ do {
+ if (buffer_new(bh) && buffer_mapped(bh) && !buffer_uptodate(bh)) {
+ memset(kaddr+block_start, 0, bh->b_size);
+ set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_dirty(bh);
+ }
+ block_start += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
return err;
}
I'll retest this, including the -EIO path.
On Sun, Jan 06, 2002 at 07:11:45PM -0800, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
> > is returned by prepare_write. vmtruncate seems way overkill,
>
> Actually, vmtruncate will trim the page off the inode as well as the
> blocks from the file. I don't think there's necessarily a problem
> with having a page wholly outside i_size, but..
agreed, a truncate_inode_pages before ->truncate is needed, or after an
extension the old bh could be still there mapped to random in the
leftover page. But still the whole vmtruncate looks overkill, there
cannot be vm mappings in the way of our truncate because in
prepare_write we worked after the i_size if truncate will later do
something. pages in the middle if i_size cannot be mapped either, so if
it was a partial write that still couldn't be mapped.
Andrea
On Sun, Jan 06, 2002 at 07:48:38PM -0800, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > On Sat, Jan 05, 2002 at 03:08:25AM -0800, Andrew Morton wrote:
> > > }
> > > return 0;
> > > out:
> > > + bh = head;
> > > + block_start = 0;
> > > + do {
> > > + if (buffer_new(bh) && !buffer_uptodate(bh)) {
> > > + memset(kaddr+block_start, 0, bh->b_size);
> > > + set_bit(BH_Uptodate, &bh->b_state);
> > > + mark_buffer_dirty(bh);
> > > + }
> > > + block_start += bh->b_size;
> > > + bh = bh->b_this_page;
> > > + } while (bh != head);
> > > return err;
> > > }
> >
> > the above code will end marking uptodate (zeroed) buffers relative to
> > blocks that cannot be read from disk. So a read-retry won't hit the disk
> > and that's wrong.
> >
> > I think that will be fixed by additionally also return -EIO in the
> > wait_on_buffer loop (instead of goto out), so we won't generate zeroed
> > uptodate cache in case of read failure.
> >
>
> Right. There's also the case where get_block() returns -EIO when,
> for example, it fails on reading an indirect block. We end up
> writing zeroes into some of the blocks. But I think that behaviour
> is correct.
yes, in such case again we've to giveup with the writes so we've to
cleanup the leftovers first.
>
> (I think I'll add a buffer_mapped() test to this code as well. It's
> a bit redundant because the fs shouldn't go setting BH_New and not
> BH_Mapped, but this code is _very_ rarely executed, and I haven't
> tested all filesystems...)
correct, it shouldn't be necessary. I wouldn't add it. if a fs breaks the
buffer_new semantics it's the one that should be fixed methinks.
>
> @@ -1633,12 +1660,22 @@ static int __block_prepare_write(struct
> */
> while(wait_bh > wait) {
> wait_on_buffer(*--wait_bh);
> - err = -EIO;
> if (!buffer_uptodate(*wait_bh))
> - goto out;
> + return -EIO;
> }
> return 0;
> out:
> + bh = head;
> + block_start = 0;
> + do {
> + if (buffer_new(bh) && buffer_mapped(bh) && !buffer_uptodate(bh)) {
> + memset(kaddr+block_start, 0, bh->b_size);
> + set_bit(BH_Uptodate, &bh->b_state);
> + mark_buffer_dirty(bh);
> + }
> + block_start += bh->b_size;
> + bh = bh->b_this_page;
> + } while (bh != head);
I found another problem, we really need to keep track of which bh are
been created by us during the failing prepare_write (buffer_new right
now, not a long time ago), or we risk to corrupt data with a write
passing over many bh, where the first bh of the page contained vaild
data since a long time ago. To do this: 1) we either keep track of it
on the kernel stack with some local variable or 2) we change
the buffer_new semantics so that they indicate an "instant buffer_new"
to clear just after checking it
> return err;
> }
>
> I'll retest this, including the -EIO path.
Andrea
Andrea Arcangeli wrote:
>
> > (I think I'll add a buffer_mapped() test to this code as well. It's
> > a bit redundant because the fs shouldn't go setting BH_New and not
> > BH_Mapped, but this code is _very_ rarely executed, and I haven't
> > tested all filesystems...)
>
> correct, it shouldn't be necessary. I wouldn't add it. if a fs breaks the
> buffer_new semantics it's the one that should be fixed methinks.
You mean "don't be lazy. Audit all the filesystems"? Sigh. OK.
> >
> > @@ -1633,12 +1660,22 @@ static int __block_prepare_write(struct
> > */
> > while(wait_bh > wait) {
> > wait_on_buffer(*--wait_bh);
> > - err = -EIO;
> > if (!buffer_uptodate(*wait_bh))
> > - goto out;
> > + return -EIO;
> > }
> > return 0;
> > out:
> > + bh = head;
> > + block_start = 0;
> > + do {
> > + if (buffer_new(bh) && buffer_mapped(bh) && !buffer_uptodate(bh)) {
> > + memset(kaddr+block_start, 0, bh->b_size);
> > + set_bit(BH_Uptodate, &bh->b_state);
> > + mark_buffer_dirty(bh);
> > + }
> > + block_start += bh->b_size;
> > + bh = bh->b_this_page;
> > + } while (bh != head);
>
> I found another problem, we really need to keep track of which bh are
> been created by us during the failing prepare_write (buffer_new right
> now, not a long time ago), or we risk to corrupt data with a write
> passing over many bh, where the first bh of the page contained vaild
> data since a long time ago. To do this: 1) we either keep track of it
> on the kernel stack with some local variable or 2) we change
> the buffer_new semantics so that they indicate an "instant buffer_new"
> to clear just after checking it
Fair enough. How does this (untested) approach look?
@@ -1600,6 +1627,7 @@ static int __block_prepare_write(struct
if (block_start >= to)
break;
if (!buffer_mapped(bh)) {
+ clear_bit(BH_New, &bh->b_state);
err = get_block(inode, block, bh, 1);
if (err)
goto out;
@@ -1633,12 +1661,30 @@ static int __block_prepare_write(struct
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
- err = -EIO;
if (!buffer_uptodate(*wait_bh))
- goto out;
+ return -EIO;
}
return 0;
out:
+ /*
+ * Zero out any newly allocated blocks to avoid exposing stale
+ * data. If BH_New is set, we know that the block was newly
+ * allocated in the above loop.
+ */
+ bh = head;
+ block_start = 0;
+ do {
+ if (buffer_new(bh)) {
+ if (buffer_uptodate(bh))
+ printk(KERN_ERR __FUNCTION__
+ ": zeroing uptodate buffer!\n");
+ memset(kaddr+block_start, 0, bh->b_size);
+ set_bit(BH_Uptodate, &bh->b_state);
+ mark_buffer_dirty(bh);
+ }
+ block_start += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
return err;
}
On Sun, Jan 06, 2002 at 08:28:37PM -0800, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > > (I think I'll add a buffer_mapped() test to this code as well. It's
> > > a bit redundant because the fs shouldn't go setting BH_New and not
> > > BH_Mapped, but this code is _very_ rarely executed, and I haven't
> > > tested all filesystems...)
> >
> > correct, it shouldn't be necessary. I wouldn't add it. if a fs breaks the
> > buffer_new semantics it's the one that should be fixed methinks.
>
> You mean "don't be lazy. Audit all the filesystems"? Sigh. OK.
actually I meant more "don't care about filesystems that are broken
anwways" :) but since we're changing the semantincs ourself below in the
mainstream we'll have to check for the other internal usages of
buffer_new, should be very easy though.
>
> > >
> > > @@ -1633,12 +1660,22 @@ static int __block_prepare_write(struct
> > > */
> > > while(wait_bh > wait) {
> > > wait_on_buffer(*--wait_bh);
> > > - err = -EIO;
> > > if (!buffer_uptodate(*wait_bh))
> > > - goto out;
> > > + return -EIO;
> > > }
> > > return 0;
> > > out:
> > > + bh = head;
> > > + block_start = 0;
> > > + do {
> > > + if (buffer_new(bh) && buffer_mapped(bh) && !buffer_uptodate(bh)) {
> > > + memset(kaddr+block_start, 0, bh->b_size);
> > > + set_bit(BH_Uptodate, &bh->b_state);
> > > + mark_buffer_dirty(bh);
> > > + }
> > > + block_start += bh->b_size;
> > > + bh = bh->b_this_page;
> > > + } while (bh != head);
> >
> > I found another problem, we really need to keep track of which bh are
> > been created by us during the failing prepare_write (buffer_new right
> > now, not a long time ago), or we risk to corrupt data with a write
> > passing over many bh, where the first bh of the page contained vaild
> > data since a long time ago. To do this: 1) we either keep track of it
> > on the kernel stack with some local variable or 2) we change
> > the buffer_new semantics so that they indicate an "instant buffer_new"
> > to clear just after checking it
>
> Fair enough. How does this (untested) approach look?
>
>
> @@ -1600,6 +1627,7 @@ static int __block_prepare_write(struct
> if (block_start >= to)
> break;
> if (!buffer_mapped(bh)) {
> + clear_bit(BH_New, &bh->b_state);
> err = get_block(inode, block, bh, 1);
> if (err)
> goto out;
> @@ -1633,12 +1661,30 @@ static int __block_prepare_write(struct
> */
> while(wait_bh > wait) {
> wait_on_buffer(*--wait_bh);
> - err = -EIO;
> if (!buffer_uptodate(*wait_bh))
> - goto out;
> + return -EIO;
> }
> return 0;
> out:
> + /*
> + * Zero out any newly allocated blocks to avoid exposing stale
> + * data. If BH_New is set, we know that the block was newly
> + * allocated in the above loop.
> + */
> + bh = head;
> + block_start = 0;
> + do {
> + if (buffer_new(bh)) {
> + if (buffer_uptodate(bh))
> + printk(KERN_ERR __FUNCTION__
> + ": zeroing uptodate buffer!\n");
> + memset(kaddr+block_start, 0, bh->b_size);
> + set_bit(BH_Uptodate, &bh->b_state);
> + mark_buffer_dirty(bh);
> + }
> + block_start += bh->b_size;
> + bh = bh->b_this_page;
> + } while (bh != head);
> return err;
this is 2) and it looks fine, but we need to update write_full_page too
to clear the bit (that's pagecache). Anybody else checking for
buffer_new on visible pagecache bhs should be updated as well.
And also generic_direct_IO needs somehow to write something if get_block
fails during a write. But I guess there it is simpler to let brw_kiovec
to go ahead before the get_block failure, otherwise we've nothing to
write (patching the iobuf->length for the duration of the brw_kiovec
should be enough, so we do a short write and we notify about the error
only if we couldn't write anything, of course it would be better to
restore the length later even if probably not needed). This will also
allow to use all the blocks of the fs and it's what the userspace expects.
For the o_direct writes in my tree the ->truncate is just in place
(truncate_inode_pages isn't needed there).
Andrea
On Sun, 6 Jan 2002, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > I prefer my fix that simply recalls the ->truncate callback if -ENOSPC
> > is returned by prepare_write. vmtruncate seems way overkill,
>
> No opinion on that here. This is what was in -ac. Perhaps Al can
> comment?
a) It's obviously correct
b) it's a friggin error-handling path and I'll take correctness over
anything here.
Keep in mind that you want to zero the area out, so at the very least
truncate_inode_pages() + ->truncate() are needed. And locking/ordering
consideration here are tricky enough to make duplicating them a Bad
Thing(tm).
On January 7, 2002 05:28 am, Andrew Morton wrote:
> Andrea Arcangeli wrote:
> >
> > > (I think I'll add a buffer_mapped() test to this code as well. It's
> > > a bit redundant because the fs shouldn't go setting BH_New and not
> > > BH_Mapped, but this code is _very_ rarely executed, and I haven't
> > > tested all filesystems...)
> >
> > correct, it shouldn't be necessary. I wouldn't add it. if a fs breaks the
> > buffer_new semantics it's the one that should be fixed methinks.
>
> You mean "don't be lazy. Audit all the filesystems"? Sigh. OK.
Isn't this a job for BUG?
--
Daniel