2008-08-14 17:48:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Fix small file fragmentation

mballoc small file block allocation use per cpu prealloc
space. Use goal block when searching for the right prealloc
space. Also make sure ext4_da_writepages tries to write
all the pages for small files in single attempt

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 21 +++++++++++++++------
fs/ext4/mballoc.c | 44 +++++++++++++++++++++++++++++++++++++-------
fs/inode.c | 1 +
3 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e144896..0b34998 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2318,13 +2318,12 @@ static int ext4_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct inode *inode = mapping->host;
handle_t *handle = NULL;
- int needed_blocks;
- int ret = 0;
- long to_write;
loff_t range_start = 0;
- long pages_skipped = 0;
+ struct inode *inode = mapping->host;
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
+ long to_write, pages_skipped = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);

/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2333,6 +2332,16 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
+ /*
+ * Make sure nr_to_write is >= sbi->s_mb_stream_request
+ * This make sure small files blocks are allocated in
+ * single attempt. This ensure that small files
+ * get less fragmented.
+ */
+ if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+ nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+ wbc->nr_to_write = sbi->s_mb_stream_request;
+ }

if (!wbc->range_cyclic)
/*
@@ -2413,7 +2422,7 @@ static int ext4_da_writepages(struct address_space *mapping,
}

out_writepages:
- wbc->nr_to_write = to_write;
+ wbc->nr_to_write = to_write - nr_to_writebump;
wbc->range_start = range_start;
return ret;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b14a7c7..1afcb11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3286,6 +3286,29 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
}

+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+ struct ext4_prealloc_space *pa,
+ struct ext4_prealloc_space *cpa)
+{
+ ext4_fsblk_t cur_distance, new_distance;
+
+ if (cpa == NULL) {
+ atomic_inc(&pa->pa_count);
+ return pa;
+ }
+ cur_distance = abs(goal_block - cpa->pa_pstart);
+ new_distance = abs(goal_block - pa->pa_pstart);
+
+ if (cur_distance < new_distance)
+ return cpa;
+
+ /* drop the previous reference */
+ atomic_dec(&cpa->pa_count);
+ atomic_inc(&pa->pa_count);
+ return pa;
+}
+
/*
* search goal blocks in preallocated space
*/
@@ -3295,7 +3318,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *pa;
+ struct ext4_prealloc_space *pa, *cpa = NULL;
+ ext4_fsblk_t goal_block;

/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3338,6 +3362,10 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;

+ goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
+ ac->ac_g_ex.fe_start +
+ le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+
for (i = order; i < PREALLOC_TB_SIZE; i++) {
rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
@@ -3345,17 +3373,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 &&
pa->pa_free >= ac->ac_o_ex.fe_len) {
- atomic_inc(&pa->pa_count);
- ext4_mb_use_group_pa(ac, pa);
- spin_unlock(&pa->pa_lock);
- ac->ac_criteria = 20;
- rcu_read_unlock();
- return 1;
+
+ cpa = ext4_mb_check_group_pa(goal_block,
+ pa, cpa);
}
spin_unlock(&pa->pa_lock);
}
rcu_read_unlock();
}
+ if (cpa) {
+ ext4_mb_use_group_pa(ac, cpa);
+ ac->ac_criteria = 20;
+ return 1;
+ }
return 0;
}

diff --git a/fs/inode.c b/fs/inode.c
index b6726f6..d77f0ee 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -163,6 +163,7 @@ static struct inode *alloc_inode(struct super_block *sb)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
+ mapping->writeback_index = 0;
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
--
1.6.0.rc0.42.g186458.dirty



2008-08-14 22:16:09

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation


在 2008-08-14四的 23:14 +0530,Aneesh Kumar K.V写道:
> mballoc small file block allocation use per cpu prealloc
> space. Use goal block when searching for the right prealloc
> space. Also make sure ext4_da_writepages tries to write
> all the pages for small files in single attempt
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 21 +++++++++++++++------
> fs/ext4/mballoc.c | 44 +++++++++++++++++++++++++++++++++++++-------
> fs/inode.c | 1 +
> 3 files changed, 53 insertions(+), 13 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index e144896..0b34998 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2318,13 +2318,12 @@ static int ext4_writepages_trans_blocks(struct inode *inode)
> static int ext4_da_writepages(struct address_space *mapping,
> struct writeback_control *wbc)
> {
> - struct inode *inode = mapping->host;
> handle_t *handle = NULL;
> - int needed_blocks;
> - int ret = 0;
> - long to_write;
> loff_t range_start = 0;
> - long pages_skipped = 0;
> + struct inode *inode = mapping->host;
> + int needed_blocks, ret = 0, nr_to_writebump = 0;
> + long to_write, pages_skipped = 0;
> + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>
> /*
> * No pages to write? This is mainly a kludge to avoid starting
> @@ -2333,6 +2332,16 @@ static int ext4_da_writepages(struct address_space *mapping,
> */
> if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
> return 0;
> + /*
> + * Make sure nr_to_write is >= sbi->s_mb_stream_request
> + * This make sure small files blocks are allocated in
> + * single attempt. This ensure that small files
> + * get less fragmented.
> + */
> + if (wbc->nr_to_write < sbi->s_mb_stream_request) {
> + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> + wbc->nr_to_write = sbi->s_mb_stream_request;
> + }
>

do_writepages() could be called with wbc with a specified range, is it
okay forces da writepages to flush at last 16
pages(sbi->s_mb_stream_request) all the time, which is more than what
the caller asked for?

I assume you trying to address the fragmentation issue with small
request for da_writepages() discussed in previous email? (A little more
description will be helpful here:))

> if (!wbc->range_cyclic)
> /*
> @@ -2413,7 +2422,7 @@ static int ext4_da_writepages(struct address_space *mapping,
> }
>
> out_writepages:
> - wbc->nr_to_write = to_write;
> + wbc->nr_to_write = to_write - nr_to_writebump;
> wbc->range_start = range_start;
> return ret;
> }
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index b14a7c7..1afcb11 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -3286,6 +3286,29 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
> mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
> }
>
> +static struct ext4_prealloc_space *
> +ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
> + struct ext4_prealloc_space *pa,
> + struct ext4_prealloc_space *cpa)
> +{
> + ext4_fsblk_t cur_distance, new_distance;
> +
> + if (cpa == NULL) {
> + atomic_inc(&pa->pa_count);
> + return pa;
> + }
> + cur_distance = abs(goal_block - cpa->pa_pstart);
> + new_distance = abs(goal_block - pa->pa_pstart);
> +
> + if (cur_distance < new_distance)
> + return cpa;
> +
> + /* drop the previous reference */
> + atomic_dec(&cpa->pa_count);
> + atomic_inc(&pa->pa_count);
> + return pa;
> +}
> +
> /*
> * search goal blocks in preallocated space
> */
> @@ -3295,7 +3318,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
> int order, i;
> struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
> struct ext4_locality_group *lg;
> - struct ext4_prealloc_space *pa;
> + struct ext4_prealloc_space *pa, *cpa = NULL;
> + ext4_fsblk_t goal_block;
>
> /* only data can be preallocated */
> if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
> @@ -3338,6 +3362,10 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
> /* The max size of hash table is PREALLOC_TB_SIZE */
> order = PREALLOC_TB_SIZE - 1;
>
> + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
> + ac->ac_g_ex.fe_start +
> + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
> +
> for (i = order; i < PREALLOC_TB_SIZE; i++) {
> rcu_read_lock();
> list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
> @@ -3345,17 +3373,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
> spin_lock(&pa->pa_lock);
> if (pa->pa_deleted == 0 &&
> pa->pa_free >= ac->ac_o_ex.fe_len) {
> - atomic_inc(&pa->pa_count);
> - ext4_mb_use_group_pa(ac, pa);
> - spin_unlock(&pa->pa_lock);
> - ac->ac_criteria = 20;
> - rcu_read_unlock();
> - return 1;
> +
> + cpa = ext4_mb_check_group_pa(goal_block,
> + pa, cpa);
> }

cpa is initalized as NULL, and I could not see where we set cpa any
pointer before calling ext4_mb_check_group_pa(). If I understand
right, the code above passes a NULL pointer to
ext4_mb_check_group_pa(), which will result in just choose the pa
pointer directly, and bypass the distance calculation guided by the
goal block. Did I miss anything?

> spin_unlock(&pa->pa_lock);
> }
> rcu_read_unlock();
> }
> + if (cpa) {
> + ext4_mb_use_group_pa(ac, cpa);
> + ac->ac_criteria = 20;
> + return 1;
> + }
> return 0;
> }
>
> diff --git a/fs/inode.c b/fs/inode.c
> index b6726f6..d77f0ee 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -163,6 +163,7 @@ static struct inode *alloc_inode(struct super_block *sb)
> mapping->a_ops = &empty_aops;
> mapping->host = inode;
> mapping->flags = 0;
> + mapping->writeback_index = 0;
> mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
> mapping->assoc_mapping = NULL;
> mapping->backing_dev_info = &default_backing_dev_info;

Could you explain what's this change for?

Regards,
Mingming

2008-08-15 13:38:19

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

Here's an interesting data point. Using Chris Mason's compilebench:

http://oss.oracle.com/~mason/compilebench

If I use:

./compilebench -D /mnt -i 2 -r 0

on a 4GB machine such that I have plenty of memory (and nothing gets
forced disk due to memory pressure), I don't see hardly any of the
small file fragmentation problem (0.8% of the inodes in use on the
filesystem. This is with your patch applied.

However, if I use:

./compilebench -D /mnt -i 10 -r 0

so that data blocks are getting pushed out due to memory pressure,
then I see plenty of non-contiugous inodes (8.1% of the inodes in use
on the filesystem). So with your patch applied, it seems that we
still have a problem related to delayed allocation and how the VM
system is doing its page cleaning.

- Ted

2008-08-15 16:31:24

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Fri, Aug 15, 2008 at 09:38:03AM -0400, Theodore Tso wrote:
> Here's an interesting data point. Using Chris Mason's compilebench:
>
> http://oss.oracle.com/~mason/compilebench
>
> If I use:
>
> ./compilebench -D /mnt -i 2 -r 0
>
> on a 4GB machine such that I have plenty of memory (and nothing gets
> forced disk due to memory pressure), I don't see hardly any of the
> small file fragmentation problem (0.8% of the inodes in use on the
> filesystem. This is with your patch applied.
>
> However, if I use:
>
> ./compilebench -D /mnt -i 10 -r 0
>
> so that data blocks are getting pushed out due to memory pressure,
> then I see plenty of non-contiugous inodes (8.1% of the inodes in use
> on the filesystem). So with your patch applied, it seems that we
> still have a problem related to delayed allocation and how the VM
> system is doing its page cleaning.

As I explained in my previous patch the problem is due to pdflush
background_writeout. Now when pdflush does the writeout we may
have only few pages for the file and we would attempt
to write them to disk. So my attempt in the last patch was to
do the below

a) When allocation blocks try to be close to the goal block specified
b) When we call ext4_da_writepages make sure we have minimal nr_to_write
that ensures we allocate all dirty buffer_heads in a single go.
nr_to_write is set to 1024 in pdflush background_writeout and that
would mean we may end up calling some inodes writepages() with really
small values even though we have more dirty buffer_heads.

What it doesn't handle is
1) File A have 4 dirty buffer_heads.
2) pdflush try to write them. We get 4 contig blocks
3) File A now have new 5 dirty_buffer_heads
4) File B now have 6 dirty_buffer_heads
5) pdflush try to write the 6 dirty buffer_heads of file B and allocate
them next to earlier file A blocks
6) pdflush try to write the 5 dirty buffer_heads of file A and allocate
them after file B blocks resulting in discontinuity.

I am right now testing the below patch which make sure new dirty inodes
are added to the tail of the dirty inode list

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3..a658690 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -163,7 +163,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!was_dirty) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
+ //list_move(&inode->i_list, &sb->s_dirty);
+ __list_del(&inode->i_list->prev, &inode->i_list->next);
+ list_add_tail(&inode->i_list, &sb->s_dirty);
}
}
out:

2008-08-15 16:33:10

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Fri, Aug 15, 2008 at 10:01:12PM +0530, Aneesh Kumar K.V wrote:
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 25adfc3..a658690 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -163,7 +163,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> */
> if (!was_dirty) {
> inode->dirtied_when = jiffies;
> - list_move(&inode->i_list, &sb->s_dirty);
> + //list_move(&inode->i_list, &sb->s_dirty);
> + __list_del(&inode->i_list->prev, &inode->i_list->next);
> + list_add_tail(&inode->i_list, &sb->s_dirty);
> }
> }
> out:

better one

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3..91f3c54 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -163,7 +163,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!was_dirty) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move_tail(&inode->i_list, &sb->s_dirty);
}
}
out:

2008-08-15 17:52:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Thu, Aug 14, 2008 at 07:18:17PM -0400, Theodore Tso wrote:
> On Thu, Aug 14, 2008 at 11:14:40PM +0530, Aneesh Kumar K.V wrote:
> > mballoc small file block allocation use per cpu prealloc
> > space. Use goal block when searching for the right prealloc
> > space. Also make sure ext4_da_writepages tries to write
> > all the pages for small files in single attempt
>
> Hi Aneesh, how are you testing your patch? I've created the following
> shell script:
>
> -------------------------------
> #!/bin/sh
> #
> # small-files-frag-test --- test for small files fragmentation
>
> DEVICE=/dev/thunk/testbench
>
> mke2fs -t ext4dev $DEVICE
> mount -t ext4dev $DEVICE /mnt
> tar -C /usr -cf - bin lib | tar -C /mnt -xpf -
> sync; sleep 5
> umount $DEVICE
> e2fsck -nfv -E fragcheck $DEVICE
> -------------------------------
>
> ... and the results show roughly the same amount of fragmentation, and the
> same pattern. In fact, it's a ltitle worse (30% vs 25%).
>
> 37912 inodes used (11.57%)
> 11468 non-contiguous inodes (30.2%)
> # of inodes with ind/dind/tind blocks: 0/0/0
> Extent depth histogram: 32638/5
> 711894 blocks used (54.31%)
>


I have better results with the below patch on top of the patch i sent.

21156 inodes used (0.47%)
158 non-contiguous inodes (0.7%)
# of inodes with ind/dind/tind blocks: 4/4/4
581216 blocks used (3.24%)
0 bad blocks
1 large file

commit 6ad9d25595aea8efa0d45c0a2dd28b4a415e34e6
Author: Aneesh Kumar K.V <[email protected]>
Date: Fri Aug 15 23:19:15 2008 +0530

move the dirty inodes to the end of the list

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1afcb11..650b021 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4258,7 +4258,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
- int order, added = 0, lg_prealloc_count = 1;
+ int order, lg_prealloc_count = 1;
+ bool added = 0;
struct super_block *sb = ac->ac_sb;
struct ext4_locality_group *lg = ac->ac_lg;
struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3..95eee62 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -163,7 +163,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!was_dirty) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move_tail(&inode->i_list, &sb->s_dirty);
}
}
out:
@@ -208,7 +208,7 @@ static void redirty_tail(struct inode *inode)
*/
static void requeue_io(struct inode *inode)
{
- list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ list_move_tail(&inode->i_list, &inode->i_sb->s_more_io);
}

static void inode_sync_complete(struct inode *inode)


Attachments:
(No filename) (2.76 kB)
out.bz2 (1.64 kB)
Download all attachments

2008-08-15 18:07:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Fri, Aug 15, 2008 at 11:22:43PM +0530, Aneesh Kumar K.V wrote:
> On Thu, Aug 14, 2008 at 07:18:17PM -0400, Theodore Tso wrote:
> > On Thu, Aug 14, 2008 at 11:14:40PM +0530, Aneesh Kumar K.V wrote:
> > > mballoc small file block allocation use per cpu prealloc
> > > space. Use goal block when searching for the right prealloc
> > > space. Also make sure ext4_da_writepages tries to write
> > > all the pages for small files in single attempt
> >
> > Hi Aneesh, how are you testing your patch? I've created the following
> > shell script:
> >
> > -------------------------------
> > #!/bin/sh
> > #
> > # small-files-frag-test --- test for small files fragmentation
> >
> > DEVICE=/dev/thunk/testbench
> >
> > mke2fs -t ext4dev $DEVICE
> > mount -t ext4dev $DEVICE /mnt
> > tar -C /usr -cf - bin lib | tar -C /mnt -xpf -
> > sync; sleep 5
> > umount $DEVICE
> > e2fsck -nfv -E fragcheck $DEVICE
> > -------------------------------
> >
> > ... and the results show roughly the same amount of fragmentation, and the
> > same pattern. In fact, it's a ltitle worse (30% vs 25%).
> >
> > 37912 inodes used (11.57%)
> > 11468 non-contiguous inodes (30.2%)
> > # of inodes with ind/dind/tind blocks: 0/0/0
> > Extent depth histogram: 32638/5
> > 711894 blocks used (54.31%)
> >
>
>
> I have better results with the below patch on top of the patch i sent.
>
> 21156 inodes used (0.47%)
> 158 non-contiguous inodes (0.7%)
> # of inodes with ind/dind/tind blocks: 4/4/4
> 581216 blocks used (3.24%)
> 0 bad blocks
> 1 large file
>

And the fragmented inodes are all directories for which we don't
use prealloc space

debugfs: ncheck 12
Inode Pathname
12 /bin
debugfs: ncheck 1987
Inode Pathname
1987 /lib
debugfs: ncheck 7657
Inode Pathname
7657 /lib/python2.5
debugfs: ncheck 11602
Inode Pathname
11602 /lib/X11/xserver
debugfs: ncheck 14279
Inode Pathname
14279 /lib/locale
debugfs: ncheck 20615
Inode Pathname
20615 /lib/ooo-2.0/program
debugfs:

-aneesh

2008-08-15 20:05:37

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

Aneesh, thanks! Chris Mason also pointed us at this proposed patch,
where he seems to be tracking a very similar issue for btrfs. I don't
know if you've seen it....

http://article.gmane.org/gmane.linux.file-systems/25560

- Ted

2008-08-16 04:43:18

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Fri, Aug 15, 2008 at 04:05:02PM -0400, Theodore Tso wrote:
> Aneesh, thanks! Chris Mason also pointed us at this proposed patch,
> where he seems to be tracking a very similar issue for btrfs. I don't
> know if you've seen it....
>
> http://article.gmane.org/gmane.linux.file-systems/25560
>

I have a done a similar change in the patch I sent. I also observed
wrong values in the writeback_index for range_cyclic mode. Well
as suggested in the thread I will move it to file system.

But I don't seen anything being mentioned about the fragmentation issue
in the thread.

-aneesh

2008-08-16 10:43:16

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Fix small file fragmentation

On Fri, Aug 15, 2008 at 11:22:43PM +0530, Aneesh Kumar K.V wrote:
> commit 6ad9d25595aea8efa0d45c0a2dd28b4a415e34e6
> Author: Aneesh Kumar K.V <[email protected]>
> Date: Fri Aug 15 23:19:15 2008 +0530
>
> move the dirty inodes to the end of the list
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 1afcb11..650b021 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4258,7 +4258,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
>
> static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
> {
> - int order, added = 0, lg_prealloc_count = 1;
> + int order, lg_prealloc_count = 1;
> + bool added = 0;
> struct super_block *sb = ac->ac_sb;
> struct ext4_locality_group *lg = ac->ac_lg;
> struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 25adfc3..95eee62 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -163,7 +163,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
> */
> if (!was_dirty) {
> inode->dirtied_when = jiffies;
> - list_move(&inode->i_list, &sb->s_dirty);
> + list_move_tail(&inode->i_list, &sb->s_dirty);
> }
> }
> out:
> @@ -208,7 +208,7 @@ static void redirty_tail(struct inode *inode)
> */
> static void requeue_io(struct inode *inode)
> {
> - list_move(&inode->i_list, &inode->i_sb->s_more_io);
> + list_move_tail(&inode->i_list, &inode->i_sb->s_more_io);
> }
>
> static void inode_sync_complete(struct inode *inode)

The patch is not really useful and is also wrong. Verifying the results
again I found that patch didn't make any difference. We actually read
from the s_io list from the tail not from the head (generic_sync_sb_inodes)
That means inodes are already added in the order they are dirtied.

-aneesh