This patch corrects a punch hole bug found by xfstests
when the block size is set to 1k. Test 127 runs longer
before it fails, but that appears to be a separate bug.
This bug happens because the punch hole code only zeros
out the non-block-aligned data, and then releases the pages
for data that is page aligned. This means that if the
blocks are smaller than a page, then the blocks contained
in the non-page-aligned regions (but still block aligned)
are left unzeroed and mapped.
This patch adds a new ext4_unmap_partial_page_buffers routine
that unmaps the block-aligned buffers in a page that are
contained in a specified range.
Signed-off-by: Allison Henderson <[email protected]>
---
v1 -> v2
Added EXT4_BLOCK_ZERO_DISCARD_BUFFER flag
v2 -> v3
Moved code out of ext4_block_zero_page_range and into
the new ext4_unmap_page_range function
v3 -> v4
Renamed ext4_unmap_page_range to ext4_unmap_partial_page_buffers
Moved ext4_unmap_partial_page_buffers from inode.c to extents.c
Corrected comments for non block/page aligned handling
Added checks to avoid unnecessary page unmaps
Removed unneeded journaling and mapping from new routine
:100644 100644 4d73e11... a946023... M fs/ext4/extents.c
fs/ext4/extents.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 138 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4d73e11..a946023 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4137,6 +4137,107 @@ static int ext4_xattr_fiemap(struct inode *inode,
}
/*
+ * ext4_unmap_partial_page_buffers()
+ * Unmaps a page range of length 'length' starting from offset
+ * 'from'. The range to be unmapped must be contained within
+ * one page. If the specified range exceeds the end of the page
+ * it will be shortened to the end of the page that corresponds to
+ * 'from'. Only block-aligned buffers will be unmapped;
+ * non-block-aligned buffers are skipped.
+ */
+static int ext4_unmap_partial_page_buffers(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
+ ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned int offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned int blocksize, max, pos;
+ unsigned int end_of_block, range_to_unmap;
+ ext4_lblk_t iblock;
+ struct inode *inode = mapping->host;
+ struct buffer_head *bh;
+ struct page *page;
+ int err = 0;
+
+ page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+ mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (!page)
+ return -EINVAL;
+
+ blocksize = inode->i_sb->s_blocksize;
+ max = PAGE_CACHE_SIZE - offset;
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the page
+ */
+ if (length > max || length < 0)
+ length = max;
+
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ if (!page_has_buffers(page))
+ goto unlock;
+
+ /* Find the buffer that contains "offset" */
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ pos = offset;
+ while (pos < offset + length) {
+ err = 0;
+
+ /* The length of space left to unmap */
+ range_to_unmap = offset + length - pos;
+
+ /* The length of space until the end of the block */
+ end_of_block = blocksize - (pos & (blocksize-1));
+
+ /* Do not unmap past end of block */
+ if (range_to_unmap > end_of_block)
+ range_to_unmap = end_of_block;
+
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto next;
+ }
+
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped: skip");
+ goto next;
+ }
+
+ /* If the range is not block aligned, skip */
+ if (range_to_unmap != blocksize)
+ goto next;
+
+ clear_buffer_dirty(bh);
+ bh->b_bdev = NULL;
+ clear_buffer_mapped(bh);
+ clear_buffer_req(bh);
+ clear_buffer_new(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
+ clear_buffer_uptodate(bh);
+ ClearPageUptodate(page);
+
+ BUFFER_TRACE(bh, "buffer unmapped");
+next:
+ bh = bh->b_this_page;
+ iblock++;
+ pos += range_to_unmap;
+ }
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+/*
* ext4_ext_punch_hole
*
* Punches a hole of "length" bytes in a file starting
@@ -4157,7 +4258,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
struct address_space *mapping = inode->i_mapping;
struct ext4_map_blocks map;
handle_t *handle;
- loff_t first_block_offset, last_block_offset, block_len;
+ loff_t first_block_offset, last_block_offset, block_len, page_len;
loff_t first_page, last_page, first_page_offset, last_page_offset;
int ret, credits, blocks_released, err = 0;
@@ -4206,9 +4307,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out;
/*
- * Now we need to zero out the un block aligned data.
- * If the file is smaller than a block, just
- * zero out the middle
+ * Now we need to zero out the non-block-aligned data.
+ * If the file space being truncated is smaller
+ * than a block, just zero out the middle
*/
if (first_block > last_block)
ext4_block_zero_page_range(handle, mapping, offset, length);
@@ -4227,6 +4328,39 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
}
}
+ /*
+ * Now we need to unmap the non-page-aligned buffers.
+ * If the block size is smaller than the page size
+ * and the file space being truncated is not
+ * page aligned, then unmap the buffers
+ */
+ if (inode->i_sb->s_blocksize < PAGE_CACHE_SIZE &&
+ !((offset % PAGE_CACHE_SIZE == 0) &&
+ (length % PAGE_CACHE_SIZE == 0))) {
+
+ /*
+ * If the file space being truncated is smaller
+ * than a page, just unmap the middle
+ */
+ if (first_page > last_page) {
+ ext4_unmap_partial_page_buffers(handle,
+ mapping, offset, length);
+ } else {
+ /* unmap page buffers before the first aligned page */
+ page_len = first_page_offset - offset;
+ if (page_len > 0)
+ ext4_unmap_partial_page_buffers(handle,
+ mapping, offset, page_len);
+
+ /* unmap the page buffers after the last aligned page */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ ext4_unmap_partial_page_buffers(handle,
+ mapping, last_page_offset, page_len);
+ }
+ }
+ }
+
/* If there are no blocks to remove, return now */
if (first_block >= last_block)
goto out;
--
1.7.1
Thanks, much better. I still have some questions though.
> /*
> + * ext4_unmap_partial_page_buffers()
> + * Unmaps a page range of length 'length' starting from offset
> + * 'from'. The range to be unmaped must be contained with in
> + * one page. If the specified range exceeds the end of the page
> + * it will be shortened to end of the page that cooresponds to
> + * 'from'. Only block aligned buffers will be unmapped and unblock
> + * aligned buffers are skipped
> + */
> +static int ext4_unmap_partial_page_buffers(handle_t *handle,
> + struct address_space *mapping, loff_t from, loff_t length)
Is "unmap" really the right name of this function? Isn't the real to
invalidate a buffer head that corresponds to a punched-out block?
Correct me if I'm wrong, but the primary problem that we need to worry
about here is that the block that has just been punched out could be
reused by some other inode --- but if we still have a dirty buffer
head mapped to the now-punched-out-and-released-block, writeback could
end up corrupting some other buffer. So in some sense, what we are
effectively doing is a bforget, right?
In fact, if we are doing data journalling, don't we need to call
ext4_forget()? Otherwise, the block gets reassigned, but if the block
has been data journaled, without the revoke record written by
ext4_forget(), won't we have a potential problem where the journal
replay could end up corrupting another inode's data?
Furthermore, the question I have is why we don't have precisely the same
problem with truncate? If we have a 16k page size, and we open a 15k
file for writing, and then we overwrite the first 15k, and then
truncate the file down to 4k. At the moment we'll zero out the last
11k of the file, and leave the buffer heads dirty and mapped; then
suppose those blocks get reassigned and used before writeback happens.
We're not calling ext4_unmap_partial_page_buffers() now; why isn't
this a problem today? Are we just getting lucky? Why is this more of
a problem with punch, such that xfstests 75, 112, 127, etc. are
failing?
Am I missing something here?
> + /*
> + * Now we need to unmap the non-page-aligned buffers.
> + * If the block size is smaller than the page size
> + * and the file space being truncated is not
> + * page aligned, then unmap the buffers
> + */
> + if (inode->i_sb->s_blocksize < PAGE_CACHE_SIZE &&
> + !((offset % PAGE_CACHE_SIZE == 0) &&
> + (length % PAGE_CACHE_SIZE == 0))) {
How does this solve the situation I had outlined before? Suppose we
have a 4k page size, and a 4k block size, and we issue a punch
operation with offset=4092, and length=4105. In the previous section
of code, the offsets 4092--4095 and 8192--8196 will be zero'ed out.
But offset will not be a multiple of the page size, and length will
not be a multiple of page size, but what has been left to be dealt
with *is* an exact multiple of a page size, and thus could be
optimized out.
I didn't see any changes in your patch that adjust offset and length
after calling ext4_block_zero_page_range(); so I think we still have
an optimization opportunity we're missing here.
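
For what it's worth, the kind of adjustment I have in mind is something
like this (untested, and just a sketch using the variables from your
patch): once the partial blocks have been zeroed, shrink offset/length to
the block-aligned middle before doing the page-alignment test, so a case
like offset=4092/length=4105 with 4k blocks is recognized as page aligned
and the unmap pass can be skipped entirely:

	if (first_block <= last_block) {
		/* only the block-aligned middle of the hole is left */
		length = last_block_offset - first_block_offset;
		offset = first_block_offset;
	}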
Regards,
- Ted
On 08/07/2011 07:42 PM, Ted Ts'o wrote:
> Thanks, much better. I still have some questions though.
>
>> /*
>> + * ext4_unmap_partial_page_buffers()
>> + * Unmaps a page range of length 'length' starting from offset
>> + * 'from'. The range to be unmaped must be contained with in
>> + * one page. If the specified range exceeds the end of the page
>> + * it will be shortened to end of the page that cooresponds to
>> + * 'from'. Only block aligned buffers will be unmapped and unblock
>> + * aligned buffers are skipped
>> + */
>> +static int ext4_unmap_partial_page_buffers(handle_t *handle,
>> + struct address_space *mapping, loff_t from, loff_t length)
>
> Is "unmap" really the right name of this function? Isn't the real to
> invalidate a buffer head that corresponds to a punched-out block?
>
> Correct me if I'm wrong, but the primary problem that we need to worry
> about here is that the block that has just been punched out could be
> reused by some other inode --- but if we still have a dirty buffer
> head mapped to the now-punched-out-and-released-block, writeback could
> end up corrupting some other buffer. So in some sense, what we are
> effectively doing is a bforget, right?
Hmm, well actually when I spotted this bug, I was thinking more along the lines of: because the buffer is still mapped, we end up reading stale data out of it. They are not supposed to be dirty because we flush out all the pages before punching holes. We did this with the idea of avoiding race conditions, and also it simplified the code because the delayed extents are gone. But now that you point it out, I'm seeing that we only flush the aligned pages in the hole, so maybe we need to flush the extra two non-aligned pages at the beginning and end.
>
> In fact, if we are doing data journalling, don't we need to call
> ext4_forget()? Otherwise, the block gets reassigned, but if the block
> has been data journaled, without the revoke record written by
> ext4_forget(), won't we have a potential problem where the journal
> replay could end up corrupting another inode's data?
Well, I did some investigating on ext4_forget, and it looks like it gets called in ext4_free_blocks when ext4_remove_blocks is called (which sets the forget flag). So it looks like we do end up calling it, but not until we get down to where we are actually punching out the blocks.
>
> Furthermore, the question I have is why don't have precisely the same
> problem with truncate? If we have a 16k page size, and we open a 15k
> file for writing, and then we overwrite the first 15k, and then
> truncate the file down to 4k. At the moment we'll zero out the last
> 11k of the file, and leave the buffer heads dirty and mapped; then
> suppose those blocks get reassigned and used before writeback happens.
> We're not calling ext4_unmap_partial_page_buffers() now; why isn't
> this a problem today? Are we just getting lucky? Why is this more of
> a problem with punch, such that xfstests 75, 112, 127, etc. are
> failing?
>
> Am I missing something here?
That is a really good point, I'd be surprised if we've just been lucky all this time. I am thinking maybe it is because of the fact that i_size is already trimmed down to the new size? I do not think we read beyond i_size.
Initially I was modeling punch hole from truncate, so they are similar. The truncate code is actually calling ext4_block_truncate_page, which only zeros out the end of the block, and does not bother with the remaining buffer heads appearing after i_size. Then it goes to ext4_ext_remove_space, which is pretty much calling remove_blocks like punch hole does, except that it removes everything from the end of the file to the last block that contains i_size. So I think the fact that i_size changes would make sense.
>
>> + /*
>> + * Now we need to unmap the non-page-aligned buffers.
>> + * If the block size is smaller than the page size
>> + * and the file space being truncated is not
>> + * page aligned, then unmap the buffers
>> + */
>> + if (inode->i_sb->s_blocksize < PAGE_CACHE_SIZE &&
>> + !((offset % PAGE_CACHE_SIZE == 0) &&
>> + (length % PAGE_CACHE_SIZE == 0))) {
>
> How does this solve the situation I had outlined before? Suppose are
> have a 4k page size, and a 4k block size, and we issue a punch
> operation with offset=4092, and length=4105. In the previous section
> of code, the offsets 4092--4095 and 8193--8197 will be zero'ed out.
> But offset will not be a multiple of the page size, and length will
> not be a multiple of page size, but what has been left to be dealt
> with *is* an exactl multiple of a page size, and thus could be
> optimized out.
>
> I didn't see any changes in your patch that adjust offset and length
> after calling ext4_block_zero_page_range(); so I think we still have
> an optimization opportunity we're missing here.
>
I see, sorry I must have misunderstood the first time. Maybe what we need instead is to just check to see if page_len is larger than a block instead of just larger than zero. Something like this:
/* unmap page buffers before the first aligned page */
page_len = first_page_offset - offset;
if (page_len > blocksize)
ext4_unmap_partial_page_buffers(handle,
mapping, offset, page_len);
/* unmap the page buffers after the last aligned page */
page_len = offset + length - last_page_offset;
if (page_len > blocksize) {
ext4_unmap_partial_page_buffers(handle,
mapping, last_page_offset, page_len);
}
In both cases, the range will start or end on a page boundary, so if the range we're unmapping is greater than a block, then there is at least one buffer head that needs unmapping. Does that catch all the corner cases?
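Just to sanity check it with some made up numbers (1k blocks, 4k pages):
if offset is 5000, then first_page_offset is 8192 and page_len is 3192,
which is bigger than a block, and the block aligned buffers covering
5120-8191 do need to be unmapped. If offset is 7500, page_len is only
692; the one buffer that overlaps the hole also holds data outside of it,
so it has to stay mapped and the call can be skipped.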
Thx!
Allison Henderson
> Regards,
>
> - Ted
On Mon, Aug 08, 2011 at 01:54:30PM -0700, Allison Henderson wrote:
> > Correct me if I'm wrong, but the primary problem that we need to worry
> > about here is that the block that has just been punched out could be
> > reused by some other inode --- but if we still have a dirty buffer
> > head mapped to the now-punched-out-and-released-block, writeback could
> > end up corrupting some other buffer. So in some sense, what we are
> > effectively doing is a bforget, right?
>
> Hmm, well actually when I spotted this bug, I was thinking more
> along the lines of: because the buffer is still mapped, we end up
> reading stale data out of it. They are not supposed to be dirty
> because we flush out all the pages before punching holes. We did
> this with the idea of avoiding race conditions, and also it
> simplified the code because the delayed extents are gone. But now
> that you point it out, Im seeing that we only flush the aligned
> pages in the hole, so maybe we need to flush the extra two non
> aligned pages at the beginning and end.
We never use the buffer to read data "out of" the buffer. We used to
use the buffer_head as the interface to the buffer layer when reading
or writing an entire page, but the buffer heads were never part of the
buffer cache, so there was never any chance that you could read "stale
data" from a mapped buffer head. These days, we use the buffer head
as an inefficient data structure to store the mapping from the logical
block # to a physical block #. But that's really it.
But the more I look at this code, the more I think it's not quite
right, and I think that's why you're still having a failure with test
127. First of all, the comment here is wrong:
/*
* Now we need to zero out the non-block-aligned data.
* If the file space being truncated is smaller than
* than a block, just zero out the middle
*/
If blocksize != pagesize, this comment is _wrong_ in the case where the
punched region is within a single page, but larger than a block:
if (first_block > last_block)
ext4_block_zero_page_range(handle, mapping, offset, length);
ext4_block_zero_page() will zero out the entire range that has been
punched out if it is within a single page. And that is in fact a good
and proper thing to do, even if it is larger than a block. For
example, assume a page size of 4096, and a block size of 1024, and the
punched-out-region begins at offset 1000 and extends for 1030 bytes.
It's larger than a block, but because it's within a single page, we
zero out the entire range.
else {
/* zero out the head of the hole before the first block */
block_len = first_block_offset - offset;
if (block_len > 0)
ext4_block_zero_page_range(handle, mapping,
offset, block_len);
/* zero out the tail of the hole after the last block */
block_len = offset + length - last_block_offset;
if (block_len > 0) {
ext4_block_zero_page_range(handle, mapping,
last_block_offset, block_len);
}
}
OK, now make the same assumption, but the punch range is 2000 for a
length of 6000 bytes. And assume the file is 8192 bytes. Right now
we will only zero out the range 2000-2048, and 7168--8000, since these
are the "tails" before the "first block" and after the "last block".
But if both 4k pages are mapped in the page cache, in fact we need to
zero out the ranges 2000-4095 and 4096--8000! Why? Because we don't
read things out of the buffer cache, we read things out of the page
cache. Or to make this more obvious, suppose this file is mmap'ed
into some process address space. If you don't zero out the entire
range of bytes, then when the process reads from the mmap'ed region,
it will see non-zero pages. It matters not a whit that the buffer
heads are unmapped in ext4_unmap_partial_page_buffers().
The reason why it's important to remove the mapped buffers is because
if we ever call writepage() or readpage(), we use the mapped
buffers as a cache so we don't have to call ext4_map_blocks(). Hence,
if we remove some blocks from the logical->physical block mapping via
the punch() system call, we need to update the buffer_heads that may
be attached to the page since we have to keep the cache in sync.
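
To make the writeback side of this concrete, the shape of the problem is
roughly this (illustration only, not actual ext4 code):

	/*
	 * A block_write_full_page()-style path trusts an already-mapped
	 * buffer_head and skips the block lookup, so a stale mapping left
	 * behind by punch sends the write to a block the file no longer
	 * owns.
	 */
	static int write_one_buffer(struct inode *inode,
				    struct buffer_head *bh, sector_t iblock)
	{
		int err = 0;

		if (!buffer_mapped(bh))
			/* fresh logical->physical lookup */
			err = ext4_get_block(inode, iblock, bh, 1);
		if (err)
			return err;

		/* bh->b_blocknr is trusted as the target of the write */
		mark_buffer_async_write(bh);
		submit_bh(WRITE, bh);
		return 0;
	}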
Does that make sense?
> > Furthermore, the question I have is why don't have precisely the same
> > problem with truncate? If we have a 16k page size, and we open a 15k
> > file for writing, and then we overwrite the first 15k, and then
> > truncate the file down to 4k. At the moment we'll zero out the last
> > 11k of the file, and leave the buffer heads dirty and mapped; then
> > suppose those blocks get reassigned and used before writeback happens.
> > We're not calling ext4_unmap_partial_page_buffers() now; why isn't
> > this a problem today? Are we just getting lucky? Why is this more of
> > a problem with punch, such that xfstests 75, 112, 127, etc. are
> > failing?
> >
> > Am I missing something here?
>
> That is a really good point, I'd be surprised if we've just been
> lucky all this time. I am thinking maybe it is because of the fact
> that i_size is already trimmed down to the new size? I do not think
> we read beyond i_size.
It's not the read(2) that I'm worried about; it's what happens if we
call writepage() on that last block. If we have a left-over
buffer_head which is still mapped, and that block has been
repurposed, when we call writepage() we'll end up potentially smashing
someone else's block. If that block hasn't gotten reused at the time
of the writepage(), we'll just do some pointless I/O, but it won't
cause any harm. I suspect we're just getting lucky....
Regards,
- Ted
On 08/09/2011 09:45 AM, Ted Ts'o wrote:
> On Mon, Aug 08, 2011 at 01:54:30PM -0700, Allison Henderson wrote:
>>> Correct me if I'm wrong, but the primary problem that we need to worry
>>> about here is that the block that has just been punched out could be
>>> reused by some other inode --- but if we still have a dirty buffer
>>> head mapped to the now-punched-out-and-released-block, writeback could
>>> end up corrupting some other buffer. So in some sense, what we are
>>> effectively doing is a bforget, right?
>>
>> Hmm, well actually when I spotted this bug, I was thinking more
>> along the lines of: because the buffer is still mapped, we end up
>> reading stale data out of it. They are not supposed to be dirty
>> because we flush out all the pages before punching holes. We did
>> this with the idea of avoiding race conditions, and also it
>> simplified the code because the delayed extents are gone. But now
>> that you point it out, Im seeing that we only flush the aligned
>> pages in the hole, so maybe we need to flush the extra two non
>> aligned pages at the beginning and end.
>
> We never use the buffer to read data "out of" the buffer. We used to
> use the buffer_head as the interface to the buffer layer when reading
> or writing an entire page, but the buffer heads were never part of the
> buffer cache, so there was never any chance that you could read "stale
> data" from a mapped buffer head. These days, we use the buffer head
> as an inefficient data structure to store the mapping from the logical
> block # to a physical block #. But that's really it.
>
> But the more I look at this code, the more I think it's not quite
> right, and I think that's why you're still having a failure with test
> 127. First of all, the comment here is wrong:
>
> /*
> * Now we need to zero out the non-block-aligned data.
> * If the file space being truncated is smaller than
> * than a block, just zero out the middle
> */
>
> If blocksize != pagesize, this comment is _wrong_ in the case where
> punched region is within a single page, but larger than a block:
>
> if (first_block > last_block)
> ext4_block_zero_page_range(handle, mapping, offset, length);
>
> ext4_block_zero_page() will zero out the entire range that has been
> punched out if it is within a single page. And that is in fact a good
> and proper thing to do, even if it is larger than a block. For
> example, assume a page size of 4096, and a block size of 1024, and the
> punched-out-region begins at offset 1000 and extends for 1030 byets.
> It's larger than a block, but because it's within a single page, we
> zero out the entire range.
Hmm, for this piece here, I'm not sure I quite follow you. I was pretty
sure that ext4_block_zero_page() only deals with ranges that appear
within one block. When I modeled ext4_unmap_partial_page_buffers() after
it, I had to add a loop to get it to move over each buffer head instead
of dealing with just one. Maybe the comment should say something like "If
the file space being truncated is contained within one block". And a
similar comment for the page un-mapping code too?
>
> else {
> /* zero out the head of the hole before the first block */
> block_len = first_block_offset - offset;
> if (block_len > 0)
> ext4_block_zero_page_range(handle, mapping,
> offset, block_len);
>
> /* zero out the tail of the hole after the last block */
> block_len = offset + length - last_block_offset;
> if (block_len > 0) {
> ext4_block_zero_page_range(handle, mapping,
> last_block_offset, block_len);
> }
> }
>
> OK, now make the same assumption, but the punch range is 2000 for a
> length of 6000 bytes. And assume the file is 8192 bytes. Right now
> we will only zero out the range 2000-2048, and 7168--8000, since these
> are the "tails" before the "first block" and after the "last block".
>
> But if both 4k files are mapped in the page cache, in fact we need to
> zero out the ranges 2000-4095 and 4096--8000! Why? Because we don't
> read things out of the buffer cache, we read things out of the page
> cache. Or to make this more obvious, suppose this file is mmap'ed
> into some process address space. If you don't zero out the entire
> range of bytes, then when the process reads from the mmap'ed region,
> it will see non-zero pages. It matters not a whit that the buffers
> heads are unmapped in ext4_unmap_partial_page_buffers().
>
> The reason why it's important to remove the mapped buffers is because
> if we ever call write_page(), or read_page(), we use the mapped
> buffers as a cache so we don't have to call ext4_map_blocks(). Hence,
> if we remove some blocks from the logical->physical block mapping via
> the punch() system call, we need to update the buffer_heads that may
> be attached to the page since we have to keep the cache in sync.
>
> Does that make sense?
Yes, I think this is pretty close to what is going on in test 127. :)
When I took a closer look at the code that flushes the hole, I found it
is actually flushing the entire hole (it's a little misleading, I will
fix that), but beyond i_size due to a condition that matches what you
describe. So what you're saying now makes a lot of sense. :) So it looks
like what I need to do now is add some code to zero out the rest of the
page when i_size and the end of the hole are in the same page.
>
>>> Furthermore, the question I have is why don't have precisely the same
>>> problem with truncate? If we have a 16k page size, and we open a 15k
>>> file for writing, and then we overwrite the first 15k, and then
>>> truncate the file down to 4k. At the moment we'll zero out the last
>>> 11k of the file, and leave the buffer heads dirty and mapped; then
>>> suppose those blocks get reassigned and used before writeback happens.
>>> We're not calling ext4_unmap_partial_page_buffers() now; why isn't
>>> this a problem today? Are we just getting lucky? Why is this more of
>>> a problem with punch, such that xfstests 75, 112, 127, etc. are
>>> failing?
>>>
>>> Am I missing something here?
>>
>> That is a really good point, I'd be surprised if we've just been
>> lucky all this time. I am thinking maybe it is because of the fact
>> that i_size is already trimmed down to the new size? I do not think
>> we read beyond i_size.
>
> It's not the read(2) that I'm worried about; it's what happens if we
> call writepage() on that last block. If we have a left-over
> buffer_head which is no longer mapped, and that block has been
> repurposed, when we call writepage() we'll end up smashing potentially
> someone else's block. If that block hasn't gotten reused at the time
> of the writepage(), we'll just do some pointless I/O, but it won't
> cause any harm. I suspect we're just getting lucky....
>
Well, that does make sense, I suppose I can make a patch to flush, zero,
and unmap the buffer heads beyond i_size during a truncate, just like
how we're doing for punch hole. It would be nice to somehow verify
that the bug is there though. I wonder if fsx doesn't find it because
it's busy with only one file? Or maybe we just haven't let it run long
enough yet with a 1024 block size. If the zeroing out does the trick
for 127, I will let it run overnight and keep an eye out for any other
oddness.
Allison Henderson
> Regards,
>
> - Ted
On Tue, Aug 09, 2011 at 02:10:44PM -0700, Allison Henderson wrote:
> > /*
> > * Now we need to zero out the non-block-aligned data.
> > * If the file space being truncated is smaller than
> > * than a block, just zero out the middle
> > */
> >
> Hmm, for this piece here, Im not sure I quite follow you. I was
> pretty sure that ext4_block_zero_page() only deals with ranges that
> appear with in one block.
Yes, that's true. What I was objecting to in the comment is the
phrase "smaller than a block". That's not right, or it's only right
if blocksize == page size. That comment should really read, "if the
file space is smaller than a ***page***" zero out the middle.
That's what happens in ext4_block_zero_page(), and so it works
correctly; but the comment is confusing, and it makes the reader think
that all we need to zero is based on block boundaries, when in
fact when we look at zeroing memory, we have to base it on page
boundaries.
Basically for either punch or truncate what we must do is (sketched below):
*) Zero partial pages
*) Unmap full pages
*) We take buffer_heads which have been freed and either
(a) detach them, or
(b) clear the mapped flag, to indicate that the block number
in the bh is invalid
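
Roughly, those steps could line up in the punch path like this, reusing
the names from the patch (just a sketch; it assumes
ext4_block_zero_page_range() gets extended to cover a multi-block range
within one page, per the discussion above):

	/* zero the partial page at the head of the hole, and invalidate
	 * the punched-out buffers under it */
	page_len = first_page_offset - offset;
	if (page_len > 0) {
		ext4_block_zero_page_range(handle, mapping, offset, page_len);
		ext4_unmap_partial_page_buffers(handle, mapping, offset,
						page_len);
	}

	/* same for the partial page at the tail of the hole */
	page_len = offset + length - last_page_offset;
	if (page_len > 0) {
		ext4_block_zero_page_range(handle, mapping,
					   last_page_offset, page_len);
		ext4_unmap_partial_page_buffers(handle, mapping,
						last_page_offset, page_len);
	}

	/* the fully-covered pages (and their buffers) are already dropped
	 * from the page cache by the existing release step in
	 * ext4_ext_punch_hole() */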
Using "unmap" for both pages and blocks can be confusing, since for
pages, unmapping means that we remove the page completely from the
page cache, so any future attempt to read from that virtual memory
address will result in a zero page getting mapped in.
However, for buffers, "unmapping" merely means that we are removing
the mapping to the disk block which has been deallocated by the punch
operation. It does not result in anything getting zero'ed out in the
_page_ cache, which is why we need to zero out partial pages.
Does that make sense?
- Ted
On 08/10/2011 08:17 PM, Ted Ts'o wrote:
> On Tue, Aug 09, 2011 at 02:10:44PM -0700, Allison Henderson wrote:
>>> /*
>>> * Now we need to zero out the non-block-aligned data.
>>> * If the file space being truncated is smaller than
>>> * than a block, just zero out the middle
>>> */
>>>
>> Hmm, for this piece here, Im not sure I quite follow you. I was
>> pretty sure that ext4_block_zero_page() only deals with ranges that
>> appear with in one block.
>
> Yes, that's true. What I was objecting to in the comment is the
> phrase "smaller than a block". That's not right, or it's only right
> if blocksize == page size. That comment should really read, "if the
> file space is smaller than a ***page***" zero out the middle.
>
> That's what happens in ext4_block_zero_page(), and so it works
> correctly; but the comment is confusing, and it makes the reader think
> that all we only need to zero is based on block boundaries, when in
> fact when we look at zeroing memory, we have to base it on page
> boundaries.
>
> Basically for either punch or truncate what we must do is:
>
> *) Zero partial pages
> *) Unmap full pages
> *) We take buffer_heads which have been freed and either
> (a) detach them, or
> (b) clear the mapped flag, to indicate that the block number
> in the bh is invalid
>
> Using "unmap" for both pages and blocks can be confusing, since for
> pages, unmapping means that we remove the page completely from the
> page cache, so any future attempt to read from that virtual memory
> address will result in a zero page getting mapped in.
>
> However, for buffers, "unmapping" merely means that we are removing
> the mapping to the disk block which has been deallocated by the punch
> operation. It does not result in anything getting zero'ed out in the
> _page_ cache, which is why we need to zero out partial pages.
>
> Does that make sense?
>
> - Ted
Ah, ok then that makes sense. So maybe what I need to do is modify the
ext4_block_zero_page routine to zero everything in the specified range
and then clear the mapped flag for any buffer head that is fully
zeroed. I'm not sure I'm clear about what it means to be detached
though. What is the difference between clearing the flag and detaching
the buffer head? Thank you for all the explaining!
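
For what it's worth, something roughly like this is what I have in mind,
modeled on the unmap routine from my patch (a sketch only, not tested,
and it skips the journalling and the not-uptodate handling for now):

	static int ext4_zero_and_unmap_page_range(handle_t *handle,
			struct address_space *mapping, loff_t from, loff_t length)
	{
		unsigned int offset = from & (PAGE_CACHE_SIZE - 1);
		unsigned int blocksize, pos, end;
		struct inode *inode = mapping->host;
		struct buffer_head *bh;
		struct page *page;

		page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
					   mapping_gfp_mask(mapping) & ~__GFP_FS);
		if (!page)
			return -ENOMEM;

		blocksize = inode->i_sb->s_blocksize;
		if (length > PAGE_CACHE_SIZE - offset)
			length = PAGE_CACHE_SIZE - offset;
		end = offset + length;

		/* zero the page cache data for the whole range */
		zero_user(page, offset, length);

		if (!page_has_buffers(page))
			goto unlock;

		/* clear the mapped flag on any buffer that the range covers
		 * completely; partially covered buffers keep their mapping */
		bh = page_buffers(page);
		pos = 0;
		do {
			if (pos >= offset && pos + blocksize <= end &&
			    buffer_mapped(bh) && !buffer_freed(bh)) {
				clear_buffer_dirty(bh);
				clear_buffer_mapped(bh);
				bh->b_bdev = NULL;
			}
			pos += blocksize;
			bh = bh->b_this_page;
		} while (pos < PAGE_CACHE_SIZE);
	unlock:
		unlock_page(page);
		page_cache_release(page);
		return 0;
	}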
Allison Henderson
On Thu, Aug 11, 2011 at 02:31:01PM -0700, Allison Henderson wrote:
> Ah, ok then that makes sense. So maybe what I need to do is modify
> the ext4_block_zero_page routine to zero everything in the specified
> range and then clear the mapped flag for any buffer header that is
> fully zeroed. I'm not sure I'm clear about what it means to be
> detached though. What is the difference between clearing the flag
> and detaching the buffer head? Thank you for all the explaining!
Technically speaking, there is no guarantee that a page has either no
buffer heads, or all of the buffer heads that cover a page --- or that
the buffer_heads are in any particular order in the linked list.
In practice create_empty_buffers() creates them in a particular order,
and always creates *all* of them --- and I'm not sure if we have any
code paths left that don't use create_empty_buffers() --- but I'm
pretty sure there used to be cases where the buffer_heads could only
be partially present, or in an arbitrary order.
So if you look at the code which iterates over the page buffers,
you'll see they are very defensively coded --- and it's complicated.
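
The usual shape of that walk is something like this (handle_one is a
caller-supplied stand-in for whatever you do per buffer, not a real
kernel function):

	static void for_each_buffer_in_range(struct page *page,
					     unsigned int from, unsigned int length,
					     void (*handle_one)(struct buffer_head *))
	{
		struct buffer_head *head, *bh;
		unsigned int pos = 0;

		if (!page_has_buffers(page))
			return;

		bh = head = page_buffers(page);
		do {
			/* only touch buffers overlapping [from, from + length) */
			if (pos + bh->b_size > from && pos < from + length)
				handle_one(bh);
			pos += bh->b_size;
			bh = bh->b_this_page;
		} while (bh != head);
	}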
One of these days I'd really like to simplify the code so instead of
using linked list, we use an array of structures --- and something
slimmed down from a buffer_head to have only those essential fields
that ext4 uses.
One of these days....
Anyway, clearing all of the bh_flags as is currently done is good
enough, and actually probably easier (in case there's code which is
dependent on all of the bh's being there, and in order).
- Ted