LinuxLists.cc - [PATCH] ext4: Rework the ext4_da_writepages

2008-07-31 17:34:10

Subject: [PATCH] ext4: Rework the ext4_da_writepages

With the below changes we reserve only the credits needed to insert one extent
resulting from a single get_block call. That makes sure we don't take
too many journal credits during writeout. We also don't limit the number of pages
to write. That means we loop through the dirty pages building the largest
possible contiguous block request. Then we issue a single get_block request.
We may get fewer blocks than we requested. If so, we would end up not mapping
some of the buffer_heads. That means those buffer_heads are still marked delayed.
Later, in the writepage callback via __mpage_writepage, we redirty those pages.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 128 +++++++++++++++++++++++++++++-------------------------
1 files changed, 69 insertions(+), 59 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5665bec..465108b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
#include "acl.h"
#include "ext4_extents.h"

+#define MPAGE_DA_EXTENT_TAIL 0x01
+
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
@@ -1580,6 +1582,8 @@ static void ext4_da_page_release_reservation(struct page *page,
unsigned long first_page, next_page; /* extent of pages */
get_block_t *get_block;
struct writeback_control *wbc;
+ int io_done;
+ long pages_written;
};

/*
@@ -1629,6 +1633,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
index++;

err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+ if (!err)
+ mpd->pages_written++;

/*
* In error case, we have to continue because
@@ -1748,8 +1754,8 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
*/
static void mpage_da_map_blocks(struct mpage_da_data *mpd)
{
+ int err = 0;
struct buffer_head *lbh = &mpd->lbh;
- int err = 0, remain = lbh->b_size;
sector_t next = lbh->b_blocknr;
struct buffer_head new;

@@ -1759,35 +1765,25 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
if (buffer_mapped(lbh) && !buffer_delay(lbh))
return;

- while (remain) {
- new.b_state = lbh->b_state;
- new.b_blocknr = 0;
- new.b_size = remain;
- err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
- /*
- * Rather than implement own error handling
- * here, we just leave remaining blocks
- * unallocated and try again with ->writepage()
- */
- break;
- }
- BUG_ON(new.b_size == 0);
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = lbh->b_size;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err)
+ return;
+ BUG_ON(new.b_size == 0);

- if (buffer_new(&new))
- __unmap_underlying_blocks(mpd->inode, &new);
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);

- /*
- * If blocks are delayed marked, we need to
- * put actual blocknr and drop delayed bit
- */
- if (buffer_delay(lbh))
- mpage_put_bnr_to_bhs(mpd, next, &new);
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);

- /* go for the remaining blocks */
- next += new.b_size >> mpd->inode->i_blkbits;
- remain -= new.b_size;
- }
+ return;
}

#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1832,13 +1828,9 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* need to flush current extent and start new one
*/
mpage_da_map_blocks(mpd);
-
- /*
- * Now start a new extent
- */
- lbh->b_size = bh->b_size;
- lbh->b_state = bh->b_state & BH_FLAGS;
- lbh->b_blocknr = logical;
+ mpage_da_submit_io(mpd);
+ mpd->io_done = 1;
+ return;
}

/*
@@ -1858,6 +1850,17 @@ static int __mpage_da_writepage(struct page *page,
struct buffer_head *bh, *head, fake;
sector_t logical;

+ if (mpd->io_done) {
+ /*
+ * Rest of the page in the page_vec
+ * redirty then and skip then. We will
+ * try to to write them again after
+ * starting a new transaction
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
+ }
/*
* Can we merge this page to current extent?
*/
@@ -1869,6 +1872,13 @@ static int __mpage_da_writepage(struct page *page,
if (mpd->next_page != mpd->first_page) {
mpage_da_map_blocks(mpd);
mpage_da_submit_io(mpd);
+ /*
+ * skip rest of the page in the page_vec
+ */
+ mpd->io_done = 1;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
}

/*
@@ -1899,6 +1909,8 @@ static int __mpage_da_writepage(struct page *page,
set_buffer_dirty(bh);
set_buffer_uptodate(bh);
mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
} else {
/*
* Page with regular buffer heads, just add all dirty ones
@@ -1907,8 +1919,11 @@ static int __mpage_da_writepage(struct page *page,
bh = head;
do {
BUG_ON(buffer_locked(bh));
- if (buffer_dirty(bh))
+ if (buffer_dirty(bh)) {
mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
+ }
logical++;
} while ((bh = bh->b_this_page) != head);
}
@@ -1943,6 +1958,7 @@ static int mpage_da_writepages(struct address_space *mapping,
get_block_t get_block)
{
struct mpage_da_data mpd;
+ long to_write;
int ret;

if (!get_block)
@@ -1956,17 +1972,22 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd.first_page = 0;
mpd.next_page = 0;
mpd.get_block = get_block;
+ mpd.io_done = 0;
+ mpd.pages_written = 0;
+
+ to_write = wbc->nr_to_write;

ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);

/*
* Handle last extent of pages
*/
- if (mpd.next_page != mpd.first_page) {
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
mpage_da_map_blocks(&mpd);
mpage_da_submit_io(&mpd);
}

+ wbc->nr_to_write = to_write - mpd.pages_written;
return ret;
}

@@ -2178,10 +2199,6 @@ static int ext4_da_writepages(struct address_space *mapping,
int ret = 0;
long to_write;
loff_t range_start = 0;
- int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
- int max_credit_blocks = ext4_journal_max_transaction_buffers(inode);
- int need_credits_per_page = ext4_writepages_trans_blocks(inode, 1);
- int max_writeback_pages = (max_credit_blocks / blocks_per_page) / need_credits_per_page;

/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2205,25 +2222,11 @@ static int ext4_da_writepages(struct address_space *mapping,
range_start = wbc->range_start;
}

- while (!ret && to_write) {
- /*
- * set the max dirty pages could be write at a time
- * to fit into the reserved transaction credits
- */
- if (wbc->nr_to_write > max_writeback_pages)
- wbc->nr_to_write = max_writeback_pages;
+ while (!ret && to_write > 0) {
+
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);

- /*
- * Estimate the worse case needed credits to write out
- * to_write pages
- */
- needed_blocks = ext4_writepages_trans_blocks(inode,
- wbc->nr_to_write);
- while (needed_blocks > max_credit_blocks) {
- wbc->nr_to_write --;
- needed_blocks = ext4_writepages_trans_blocks(inode,
- wbc->nr_to_write);
- }
/* start a new transaction*/
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
@@ -2251,7 +2254,14 @@ static int ext4_da_writepages(struct address_space *mapping,
ret = mpage_da_writepages(mapping, wbc,
ext4_da_get_block_write);
ext4_journal_stop(handle);
- if (wbc->nr_to_write) {
+ if (ret == MPAGE_DA_EXTENT_TAIL) {
+ /*
+ * got one extent now try with
+ * rest of the pages
+ */
+ to_write += wbc->nr_to_write;
+ ret = 0;
+ } else if (wbc->nr_to_write) {
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
--
1.6.0.rc0.42.g186458.dirty

2008-07-31 17:48:13

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>

Tested with
a) fsstress with falloc
b) fsxlinux with falloc
c) fs_inode
d) ffsb
e) cp -ax / .

with fsck after each test

-aneesh

2008-07-31 20:11:02

by Andreas Dilger

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Jul 31, 2008 23:03 +0530, Aneesh Kumar wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.

Can you please clarify this? Does this mean we take one pass through the
dirty pages, but possibly do not allocate some subset of the pages? Then,
at some later time these holes are written out separately? This seems
like it would produce fragmentation if we do not work to ensure the pages
are allocated in sequence. Maybe I'm misunderstanding your comment and
the unmapped pages are immediately mapped on the next loop?

It is great that this will potentially allocate huge amounts of space
(up to 128MB ideally) in a single call if the pages are contiguous.

The only danger I can see of having many smaller transactions instead
of a single larger one is if this is causing many more transactions
in the case of e.g. O_SYNC or similar, but AFAIK that is handled at
a higher level and we should be OK.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-08-01 03:08:42

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> With the below changes we reserve credit needed to insert only one extent
> resulting from a call to single get_block. That make sure we don't take
> too much journal credits during writeout. We also don't limit the pages
> to write. That means we loop through the dirty pages building largest
> possible contiguous block request. Then we issue a single get_block request.
> We may get less block that we requested. If so we would end up not mapping
> some of the buffer_heads. That means those buffer_heads are still marked delay.
> Later in the writepage callback via __mpage_writepage we redirty those pages.

If you're only redirtying the pages in the callback, that means they
are left clean but with the delayed flag set; is that going to be
enough to keep the mm from dropping the pages because they are clean?
Or is the mechanism which prevents this that you've kept the
refcount on the pages bumped until after the callback?

- Ted

2008-08-01 04:06:47

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Thu, Jul 31, 2008 at 11:08:25PM -0400, Theodore Tso wrote:
> On Thu, Jul 31, 2008 at 11:03:25PM +0530, Aneesh Kumar K.V wrote:
> > With the below changes we reserve credit needed to insert only one extent
> > resulting from a call to single get_block. That make sure we don't take
> > too much journal credits during writeout. We also don't limit the pages
> > to write. That means we loop through the dirty pages building largest
> > possible contiguous block request. Then we issue a single get_block request.
> > We may get less block that we requested. If so we would end up not mapping
> > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > Later in the writepage callback via __mpage_writepage we redirty those pages.
>
> If you're only redirtying the pages in the callback, that means they
> are left clean but with the delayed flag set; is that going to be
> enough to keep the mm from dropping the pages because they are clean?
> Or is the mechanism which prevents this is that you've kept the
> refcount on the pages bumped until after the callback?

writepages can use redirty_page_for_writepage to skip the pages during
writeout. We do that in most of the writepage callbacks.
So I guess they would be properly marked dirty.

-aneesh

2008-08-01 04:54:24

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Thu, Jul 31, 2008 at 02:10:55PM -0600, Andreas Dilger wrote:
> On Jul 31, 2008 23:03 +0530, Aneesh Kumar wrote:
> > With the below changes we reserve credit needed to insert only one extent
> > resulting from a call to single get_block. That make sure we don't take
> > too much journal credits during writeout. We also don't limit the pages
> > to write. That means we loop through the dirty pages building largest
> > possible contiguous block request. Then we issue a single get_block request.
> > We may get less block that we requested. If so we would end up not mapping
> > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > Later in the writepage callback via __mpage_writepage we redirty those pages.
>
> Can you please clarify this? Does this mean we take one pass through the
> dirty pages, but possibly do not allocate some subset of the pages. Then,
> at some later time these holes are written out separately? This seems
> like it would produce fragmentation if we do not work to ensure the pages
> are allocated in sequence. Maybe I'm misunderstanding your comment and
> the unmapped pages are immediately mapped on the next loop?

We take multiple passes through the dirty pages until wbc->nr_to_write is
<= 0 or we don't have anything more to write. But if get_block doesn't
return the requested number of blocks, we may end up not writing out
some of the pages. Whether this can result in a disk layout worse than
the current one, I am not sure. I haven't looked at the layout yet.
But the pages which are skipped are redirtied via
redirty_page_for_writepage and will be forced out for writeout. Well,
we can do better by setting wbc->encountered_congestion = 1, even
though we are not really congested. That would cause most of the pdflush
work functions to retry writeback_inodes.

for(;;) {
...
wbc.pages_skipped = 0;
writeback_inodes(&wbc);
...

if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
if (wbc.encountered_congestion || wbc.more_io)
congestion_wait(WRITE, HZ/10);
else
break;
}

}

>
> It is great that this will potentially allocate huge amounts of space
> (up to 128MB ideally) in a single call if the pages are contiguous.
>
> The only danger I can see of having many smaller transactions instead
> of a single larger one is if this is causing many more transactions
> in the case of e.g. O_SYNC or similar, but AFAIK that is handled at
> a higher level and we should be OK.
>
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>

-aneesh

2008-08-01 05:07:45

by Aneesh Kumar K.V

[permalink] [raw]

Subject: Re: [PATCH] ext4: Rework the ext4_da_writepages

On Fri, Aug 01, 2008 at 10:24:12AM +0530, Aneesh Kumar K.V wrote:
> On Thu, Jul 31, 2008 at 02:10:55PM -0600, Andreas Dilger wrote:
> > On Jul 31, 2008 23:03 +0530, Aneesh Kumar wrote:
> > > With the below changes we reserve credit needed to insert only one extent
> > > resulting from a call to single get_block. That make sure we don't take
> > > too much journal credits during writeout. We also don't limit the pages
> > > to write. That means we loop through the dirty pages building largest
> > > possible contiguous block request. Then we issue a single get_block request.
> > > We may get less block that we requested. If so we would end up not mapping
> > > some of the buffer_heads. That means those buffer_heads are still marked delay.
> > > Later in the writepage callback via __mpage_writepage we redirty those pages.
> >
> > Can you please clarify this? Does this mean we take one pass through the
> > dirty pages, but possibly do not allocate some subset of the pages. Then,
> > at some later time these holes are written out separately? This seems
> > like it would produce fragmentation if we do not work to ensure the pages
> > are allocated in sequence. Maybe I'm misunderstanding your comment and
> > the unmapped pages are immediately mapped on the next loop?
>
> We take multiple pass through the dirty pages until wbc->nr_to_write is
> <= 0 or we don't have anything more to write. But if get_block doesn't
> return the requested number of blocks we may possibly not writeout
> some of the pages. Whether this can result in a disk layout worse than
> the current, I am not sure. I haven't looked at the layout yet.
> But these pages which are skipped are redirtied again via
> reditry_pages_for_writepage and will be forced for writeout. Well
> we can do better by setting wbc->encountered_congestion = 1; even
> though we are not really congested. That would cause most of the pdflush
> work func to retry writeback_indoes.
>
> for(;;) {
> ...
> wbc.pages_skipped = 0;
> writeback_inodes(&wbc);
> ...
>
> if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> /* Wrote less than expected */
> if (wbc.encountered_congestion || wbc.more_io)
> congestion_wait(WRITE, HZ/10);
> else
> break;
> }
>
> }
>

Like below?

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 53a8fc7..6fd527c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1773,6 +1773,14 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
return;
BUG_ON(new.b_size == 0);

+ if (new.b_size < lbh->b_size) {
+ /*
+ * allocated less blocks. force writepages
+ * to be called again
+ */
+ mpd->wbc->more_io = 1;
+ }
+
if (buffer_new(&new))
__unmap_underlying_blocks(mpd->inode, &new);

@@ -1876,6 +1884,8 @@ static int __mpage_da_writepage(struct page *page,
* skip rest of the page in the page_vec
*/
mpd->io_done = 1;
+ /* We want writepages to be called again */
+ wbc->more_io = 1;
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return MPAGE_DA_EXTENT_TAIL;