2008-01-10 15:34:09

by Aneesh Kumar K.V

[permalink] [raw]
Subject: patch queue update

Hi Mingming,

New patches for patch queue can be found at
http://www.radian.org/~kvaneesh/ext4/jan-10-2008-ver2/

The changes are
------------
a) mballoc patch got an explanation about regular allocator.
b) In the mballoc regular allocator we changed the usage of ffs to fls. I
guess it makes sense to use fls because we want to compare it against the
tunable s_mb_order2_reqs. Only requests above this order use
criteria 0 allocation.
c) stripe.patch to use the stripe size set in the super block for block
allocation.

The diff is attached for reference.

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0d31817..0085fde 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -468,7 +468,6 @@ static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
-static void ext4_mb_show_ac(struct ext4_allocation_context *ac);
static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *,
struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
@@ -1838,14 +1837,23 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
goto out;

- i = ffs(ac->ac_g_ex.fe_len);
+ /*
+ * ac->ac_2order is set only if the fe_len is a power of 2.
+ * If ac_2order is set we also set criteria to 0 so that we
+ * try exact allocation using buddy.
+ */
+ i = fls(ac->ac_g_ex.fe_len);
ac->ac_2order = 0;
- /* FIXME!!
- * What happens if i is still greater than s_mb_order2_reqs
+ /*
+ * We search using buddy data only if the order of the request
+ * is greater than or equal to sbi->s_mb_order2_reqs.
+ * You can tune it via /proc/fs/ext4/<partition>/order2_req
+ */
if (i >= sbi->s_mb_order2_reqs) {
- i--;
- if ((ac->ac_g_ex.fe_len & (~(1 << i))) == 0)
+ /*
+ * This should tell if fe_len is exactly a power of 2
+ */
+ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
ac->ac_2order = i;
}

@@ -1865,17 +1873,17 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
spin_unlock(&sbi->s_md_lock);
}

+ /* searching for the right group start from the goal value specified */
group = ac->ac_g_ex.fe_group;

/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
+ /*
+ * cr == 0 try to get exact allocation,
+ * cr == 3 try to get anything
+ */
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
- /* FIXME!!
- * We need to explain what criteria is and also
- * need to define the number 0 to 4 for criteria
- * What they actually means.
- */
ac->ac_criteria = cr;
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
@@ -1889,23 +1897,28 @@ repeat:
if (grp->bb_free == 0)
continue;

+ /*
+ * if the group is already init we check whether it is
+ * a good group and if not we don't load the buddy
+ */
if (EXT4_MB_GRP_NEED_INIT(EXT4_GROUP_INFO(sb, group))) {
- /* we need full data about the group
- * to make a good selection */
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
ext4_mb_release_desc(&e4b);
}

- /* check is group good for our criteries */
+ /*
+ * If the particular group doesn't satisfy our
+ * criteria we continue with the next group
+ */
if (!ext4_mb_good_group(ac, group, cr))
continue;

- /* FIXME!!
- * here also we are loading the buddy. so what difference
- * does EXT4_MB_GRP_NEED_INIT actually make
- */
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -3726,10 +3739,9 @@ repeat:
busy = 0;
ext4_unlock_group(sb, group);
/*
- * We see this quiet rare. But if a particular workload is
- * effected by this we may need to add a waitqueue
+ * Yield the CPU here so that we don't get soft lockup
*/
- schedule_timeout(HZ);
+ schedule();
goto repeat;
}

@@ -3808,7 +3820,7 @@ repeat:
printk(KERN_ERR "uh-oh! used pa while discarding\n");
dump_stack();
current->state = TASK_UNINTERRUPTIBLE;
- schedule();
+ schedule_timeout(HZ);
goto repeat;

}
@@ -3832,8 +3844,12 @@ repeat:
* pa from inode's list may access already
* freed memory, bad-bad-bad */

+ /* XXX: if this happens too often, we can
+ * add a flag to force wait only in case
+ * of ->clear_inode(), but not in case of
+ * regular truncate */
current->state = TASK_UNINTERRUPTIBLE;
- schedule();
+ schedule_timeout(HZ);
goto repeat;
}
spin_unlock(&ei->i_prealloc_lock);
@@ -3878,7 +3894,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
{
BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
}
-
+#ifdef MB_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3928,6 +3944,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
}
printk(KERN_ERR "\n");
}
+#else
+#define ext4_mb_show_ac(x)
+#endif

/*
* We use locality group preallocation for small size file. The size of the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c69f4e5..9d91c60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1775,6 +1775,21 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
return (has_super + ext4_group_first_block_no(sb, bg));
}

+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+ unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+ unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+
+ if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
+ return sbi->s_stripe;
+ } else if (stripe_width <= sbi->s_blocks_per_group) {
+ return stripe_width;
+ } else if (stride <= sbi->s_blocks_per_group) {
+ return stride;
+ }
+
+ return 0;
+}

static int ext4_fill_super (struct super_block *sb, void *data, int silent)
__releases(kernel_sem)
@@ -2131,6 +2146,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_rsv_window_head.rsv_alloc_hit = 0;
sbi->s_rsv_window_head.rsv_goal_size = 0;
ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
+ /*
+ * set the stripe size. If we have specified it via mount option, then
+ * use the mount option value. If the value specified at mount time is
+ * greater than the blocks per group, use the super block value.
+ * The allocator needs it to be less than blocks per group.
+ */
+ sbi->s_stripe = ext4_get_stripe_size(sbi);

/*
* set up enough so that it can read an inode


2008-01-10 21:43:25

by Andreas Dilger

[permalink] [raw]
Subject: Re: patch queue update

On Jan 10, 2008 21:03 +0530, Aneesh Kumar K.V wrote:
> if (i >= sbi->s_mb_order2_reqs) {
> - i--;
> - if ((ac->ac_g_ex.fe_len & (~(1 << i))) == 0)
> + /*
> + * This should tell if fe_len is exactly power of 2
> + */
> + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
> ac->ac_2order = i;

While you changed i to (i - 1) in the "if" you didn't change it when
setting ac_2order... Is that incorrect?

> /*
> + * Yield the CPU here so that we don't get soft lockup
> */
> - schedule_timeout(HZ);
> + schedule();
> goto repeat;
> }
>
> @@ -3808,7 +3820,7 @@ repeat:
> printk(KERN_ERR "uh-oh! used pa while discarding\n");
> dump_stack();
> current->state = TASK_UNINTERRUPTIBLE;
> - schedule();
> + schedule_timeout(HZ);
> goto repeat;

Is this change to schedule_timeout() intentional? The earlier code is
removing the use of schedule_timeout. I could be wrong, as I didn't
follow this discussion closely, but sometimes changes like this happen
accidentally and people don't look at the patch itself...

> +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
> +{
> + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
> + unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width);
> +
> + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
> + return sbi->s_stripe;
> + } else if (stripe_width <= sbi->s_blocks_per_group) {
> + return stripe_width;
> + } else if (stride <= sbi->s_blocks_per_group) {
> + return stride;
> + }

If you are doing "return XXX" you don't need "else".

> + /*
> + * set the stripe size. If we have specified it via mount option, then
> + * use the mount option value. If the value specified at mount time is
> + * greater than the blocks per group use the super block value.
> + * Allocator needs it be less than blocks per group.
> + */
> + sbi->s_stripe = ext4_get_stripe_size(sbi);

This comment should probably go by ext4_get_stripe_size() definition instead
of here at the caller.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-01-11 04:09:22

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: patch queue update

On Thu, Jan 10, 2008 at 02:43:23PM -0700, Andreas Dilger wrote:
> On Jan 10, 2008 21:03 +0530, Aneesh Kumar K.V wrote:
> > if (i >= sbi->s_mb_order2_reqs) {
> > - i--;
> > - if ((ac->ac_g_ex.fe_len & (~(1 << i))) == 0)
> > + /*
> > + * This should tell if fe_len is exactly power of 2
> > + */
> > + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
> > ac->ac_2order = i;
>
> While you changed i to (i - 1) in the "if" you didn't change it when
> setting ac_2order... Is that incorrect?

Yes that ac_2order should be i - 1;
Will fix it in the next update.

I see that the patch queue update doesn't have most of the changes I
have placed at http://www.radian.org/~kvaneesh/ext4/jan-10-2008-ver2/

>
> > /*
> > + * Yield the CPU here so that we don't get soft lockup
> > */
> > - schedule_timeout(HZ);
> > + schedule();
> > goto repeat;
> > }
> >
> > @@ -3808,7 +3820,7 @@ repeat:
> > printk(KERN_ERR "uh-oh! used pa while discarding\n");
> > dump_stack();
> > current->state = TASK_UNINTERRUPTIBLE;
> > - schedule();
> > + schedule_timeout(HZ);
> > goto repeat;
>
> Is this change to schedule_timeout() intentional? The earlier code is
> removing the use of schedule_timeout. I could be wrong, as I didn't
> follow this discussion closely, but sometimes changes like this happen
> accidentally and people don't look at the patch itself...


The patch queue had it modified from schedule_timeout to schedule(). I
am moving it back to the original version. If we have set the task state
to TASK_UNINTERRUPTIBLE it should be schedule_timeout. And at these
places we intend to wait uninterrupted for 1 sec. The place where we
wanted to just yield is ext4_mb_discard_group_preallocations.


> > +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
> > +{
> > + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
> > + unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width);
> > +
> > + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
> > + return sbi->s_stripe;
> > + } else if (stripe_width <= sbi->s_blocks_per_group) {
> > + return stripe_width;
> > + } else if (stride <= sbi->s_blocks_per_group) {
> > + return stride;
> > + }
>
> If you are doing "return XXX" you don't need "else".
>
> > + /*
> > + * set the stripe size. If we have specified it via mount option, then
> > + * use the mount option value. If the value specified at mount time is
> > + * greater than the blocks per group use the super block value.
> > + * Allocator needs it be less than blocks per group.
> > + */
> > + sbi->s_stripe = ext4_get_stripe_size(sbi);
>
> This comment should probably go by ext4_get_stripe_size() definition instead
> of here at the caller.

Will move that to the function definition.

-aneesh