2008-01-23 22:08:18

by Andrew Morton

Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

> On Mon, 21 Jan 2008 22:02:20 -0500 "Theodore Ts'o" <[email protected]> wrote:
> From: Alex Tomas <[email protected]>
>
> Signed-off-by: Alex Tomas <[email protected]>
> Signed-off-by: Andreas Dilger <[email protected]>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> Signed-off-by: Eric Sandeen <[email protected]>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
>
> ...
>
> +#if BITS_PER_LONG == 64
> +#define mb_correct_addr_and_bit(bit, addr) \
> +{ \
> + bit += ((unsigned long) addr & 7UL) << 3; \
> + addr = (void *) ((unsigned long) addr & ~7UL); \
> +}
> +#elif BITS_PER_LONG == 32
> +#define mb_correct_addr_and_bit(bit, addr) \
> +{ \
> + bit += ((unsigned long) addr & 3UL) << 3; \
> + addr = (void *) ((unsigned long) addr & ~3UL); \
> +}
> +#else
> +#error "how many bits you are?!"
> +#endif

Why do these exist?

> +static inline int mb_test_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + return ext4_test_bit(bit, addr);
> +}

ext2_test_bit() already handles bitnum > wordsize.

If mb_correct_addr_and_bit() is actually needed then some suitable comment
would help.

> +static inline void mb_set_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_set_bit(bit, addr);
> +}
> +
> +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_set_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void mb_clear_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_clear_bit(bit, addr);
> +}
> +
> +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_clear_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)

uninlining this will save about eighty squigabytes of text.

Please review all of ext4/jbd2 with a view to removing unnecessary and wrong
inlinings.

> +{
> + char *bb;
> +
> + /* FIXME!! is this needed */
> + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
> + BUG_ON(max == NULL);
> +
> + if (order > e4b->bd_blkbits + 1) {
> + *max = 0;
> + return NULL;
> + }
> +
> + /* at order 0 we see each particular block */
> + *max = 1 << (e4b->bd_blkbits + 3);
> + if (order == 0)
> + return EXT4_MB_BITMAP(e4b);
> +
> + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
> + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
> +
> + return bb;
> +}
> +
>
> ...
>
> +#else
> +#define mb_free_blocks_double(a, b, c, d)
> +#define mb_mark_used_double(a, b, c)
> +#define mb_cmp_bitmaps(a, b)
> +#endif

Please use the do{}while(0) thing. Or, better, proper C functions which
have typechecking (unless this will cause undefined-var compile errors,
which happens sometimes)
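
For reference, the two idioms look like this (an illustrative sketch reusing
the patch's names, not code taken from the patch):

	/* macro form: do { } while (0) makes the no-op expand to a single
	 * statement that still wants a trailing ';', so call sites build
	 * the same way whether the debug body is compiled in or out */
	#define mb_cmp_bitmaps(e4b, bitmap)	do { } while (0)

	/* or, the preferred form: an empty static inline, which still
	 * type-checks its arguments in the non-debug build (modulo the
	 * undefined-var problem mentioned above) */
	static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
	{
	}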

> +/* find most significant bit */
> +static int fmsb(unsigned short word)
> +{
> + int order;
> +
> + if (word > 255) {
> + order = 7;
> + word >>= 8;
> + } else {
> + order = -1;
> + }
> +
> + do {
> + order++;
> + word >>= 1;
> + } while (word != 0);
> +
> + return order;
> +}

Did we just reinvent fls()?
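
For comparison (illustrative only, not from the patch): fls() returns a
1-based bit position, fls(1) == 1 and fls(0) == 0, while fmsb() above returns
a 0-based order, fmsb(1) == 0. So the drop-in spelling for the non-zero
lengths used here would be:

	static inline int fmsb_equiv(unsigned short word)
	{
		return fls(word) - 1;	/* word is never 0 at the call site */
	}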

> +/* FIXME!! need more doc */
> +static void ext4_mb_mark_free_simple(struct super_block *sb,
> + void *buddy, unsigned first, int len,
> + struct ext4_group_info *grp)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + unsigned short min;
> + unsigned short max;
> + unsigned short chunk;
> + unsigned short border;
> +
> + BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
> +
> + border = 2 << sb->s_blocksize_bits;

Won't this explode with >= 32k blocksize?
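
Spelling the concern out: border is an unsigned short, so with a 32 KiB
block size (s_blocksize_bits == 15) the assignment computes

	2 << 15 == 65536	/* one past USHRT_MAX (65535) */

while the common 4 KiB case (2 << 12 == 8192) stays comfortably in range.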

> + while (len > 0) {
> + /* find how many blocks can be covered since this position */
> + max = ffs(first | border) - 1;
> +
> + /* find how many blocks of power 2 we need to mark */
> + min = fmsb(len);
> +
> + if (max < min)
> + min = max;
> + chunk = 1 << min;
> +
> + /* mark multiblock chunks only */
> + grp->bb_counters[min]++;
> + if (min > 0)
> + mb_clear_bit(first >> min,
> + buddy + sbi->s_mb_offsets[min]);
> +
> + len -= chunk;
> + first += chunk;
> + }
> +}
> +
>
> ...
>
> +static int ext4_mb_init_cache(struct page *page, char *incore)
> +{
> + int blocksize;
> + int blocks_per_page;
> + int groups_per_page;
> + int err = 0;
> + int i;
> + ext4_group_t first_group;
> + int first_block;
> + struct super_block *sb;
> + struct buffer_head *bhs;
> + struct buffer_head **bh;
> + struct inode *inode;
> + char *data;
> + char *bitmap;
> +
> + mb_debug("init page %lu\n", page->index);
> +
> + inode = page->mapping->host;
> + sb = inode->i_sb;
> + blocksize = 1 << inode->i_blkbits;
> + blocks_per_page = PAGE_CACHE_SIZE / blocksize;
> +
> + groups_per_page = blocks_per_page >> 1;
> + if (groups_per_page == 0)
> + groups_per_page = 1;
> +
> + /* allocate buffer_heads to read bitmaps */
> + if (groups_per_page > 1) {
> + err = -ENOMEM;
> + i = sizeof(struct buffer_head *) * groups_per_page;
> + bh = kmalloc(i, GFP_NOFS);
> + if (bh == NULL)
> + goto out;
> + memset(bh, 0, i);

kzalloc()

> + } else
> + bh = &bhs;
> +
> + first_group = page->index * blocks_per_page / 2;
> +
> + /* read all groups the page covers into the cache */
> + for (i = 0; i < groups_per_page; i++) {
> + struct ext4_group_desc *desc;
> +
> + if (first_group + i >= EXT4_SB(sb)->s_groups_count)
> + break;
> +
> + err = -EIO;
> + desc = ext4_get_group_desc(sb, first_group + i, NULL);
> + if (desc == NULL)
> + goto out;
> +
> + err = -ENOMEM;
> + bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
> + if (bh[i] == NULL)
> + goto out;
> +
> + if (buffer_uptodate(bh[i]))
> + continue;
> +
> + lock_buffer(bh[i]);
> + if (buffer_uptodate(bh[i])) {
> + unlock_buffer(bh[i]);
> + continue;
> + }

Didn't we just add a helper in fs/buffer.c to do this?
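
Presumably the helper meant here is bh_uptodate_or_lock(), which the
follow-up diff later in this thread switches to:

	if (bh_uptodate_or_lock(bh[i]))
		continue;

It returns true if the buffer was already uptodate (dropping the buffer lock
again if it had to take it to find that out), and returns false with the
buffer locked when the bitmap still needs to be read.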

> + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> + ext4_init_block_bitmap(sb, bh[i],
> + first_group + i, desc);
> + set_buffer_uptodate(bh[i]);
> + unlock_buffer(bh[i]);
> + continue;
> + }
> + get_bh(bh[i]);
> + bh[i]->b_end_io = end_buffer_read_sync;
> + submit_bh(READ, bh[i]);
> + mb_debug("read bitmap for group %lu\n", first_group + i);
> + }
> +
> + /* wait for I/O completion */
> + for (i = 0; i < groups_per_page && bh[i]; i++)
> + wait_on_buffer(bh[i]);
> +
> + err = -EIO;
> + for (i = 0; i < groups_per_page && bh[i]; i++)
> + if (!buffer_uptodate(bh[i]))
> + goto out;
> +
> + first_block = page->index * blocks_per_page;
> + for (i = 0; i < blocks_per_page; i++) {
> + int group;
> + struct ext4_group_info *grinfo;
> +
> + group = (first_block + i) >> 1;
> + if (group >= EXT4_SB(sb)->s_groups_count)
> + break;
> +
> + /*
> + * data carry information regarding this
> + * particular group in the format specified
> + * above
> + *
> + */
> + data = page_address(page) + (i * blocksize);
> + bitmap = bh[group - first_group]->b_data;
> +
> + /*
> + * We place the buddy block and bitmap block
> + * close together
> + */
> + if ((first_block + i) & 1) {
> + /* this is block of buddy */
> + BUG_ON(incore == NULL);
> + mb_debug("put buddy for group %u in page %lu/%x\n",
> + group, page->index, i * blocksize);
> + memset(data, 0xff, blocksize);
> + grinfo = ext4_get_group_info(sb, group);
> + grinfo->bb_fragments = 0;
> + memset(grinfo->bb_counters, 0,
> + sizeof(unsigned short)*(sb->s_blocksize_bits+2));
> + /*
> + * incore got set to the group block bitmap below
> + */
> + ext4_mb_generate_buddy(sb, data, incore, group);
> + incore = NULL;
> + } else {
> + /* this is block of bitmap */
> + BUG_ON(incore != NULL);
> + mb_debug("put bitmap for group %u in page %lu/%x\n",
> + group, page->index, i * blocksize);
> +
> + /* see comments in ext4_mb_put_pa() */
> + ext4_lock_group(sb, group);
> + memcpy(data, bitmap, blocksize);
> +
> + /* mark all preallocated blks used in in-core bitmap */
> + ext4_mb_generate_from_pa(sb, data, group);
> + ext4_unlock_group(sb, group);
> +
> + /* set incore so that the buddy information can be
> + * generated using this
> + */
> + incore = data;
> + }
> + }
> + SetPageUptodate(page);

Is the page locked here?

> +out:
> + if (bh) {
> + for (i = 0; i < groups_per_page && bh[i]; i++)
> + brelse(bh[i]);

put_bh()

> + if (bh != &bhs)
> + kfree(bh);
> + }
> + return err;
> +}
> +
>
> ...
>
> +static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
> +{
> + __u32 *addr;
> +
> + len = cur + len;
> + while (cur < len) {
> + if ((cur & 31) == 0 && (len - cur) >= 32) {
> + /* fast path: clear whole word at once */

s/clear/set/

> + addr = bm + (cur >> 3);
> + *addr = 0xffffffff;
> + cur += 32;
> + continue;
> + }
> + mb_set_bit_atomic(lock, cur, bm);
> + cur++;
> + }
> +}
> +
>
> ...
>
> +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
> + struct ext4_buddy *e4b)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> + int ret;
> +
> + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
> + BUG_ON(ac->ac_status == AC_STATUS_FOUND);
> +
> + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
> + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
> + ret = mb_mark_used(e4b, &ac->ac_b_ex);
> +
> + /* preallocation can change ac_b_ex, thus we store actually
> + * allocated blocks for history */
> + ac->ac_f_ex = ac->ac_b_ex;
> +
> + ac->ac_status = AC_STATUS_FOUND;
> + ac->ac_tail = ret & 0xffff;
> + ac->ac_buddy = ret >> 16;
> +
> + /* XXXXXXX: SUCH A HORRIBLE **CK */
> + /*FIXME!! Why ? */

?

> + ac->ac_bitmap_page = e4b->bd_bitmap_page;
> + get_page(ac->ac_bitmap_page);
> + ac->ac_buddy_page = e4b->bd_buddy_page;
> + get_page(ac->ac_buddy_page);
> +
> + /* store last allocated for subsequent stream allocation */
> + if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
> + spin_lock(&sbi->s_md_lock);
> + sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
> + sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
> + spin_unlock(&sbi->s_md_lock);
> + }
> +}
>
> ...
>
> +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> + ext4_group_t group)
> +{
> + struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> + struct ext4_prealloc_space *pa;
> + struct list_head *cur;
> + ext4_group_t groupnr;
> + ext4_grpblk_t start;
> + int preallocated = 0;
> + int count = 0;
> + int len;
> +
> + /* all form of preallocation discards first load group,
> + * so the only competing code is preallocation use.
> + * we don't need any locking here
> + * notice we do NOT ignore preallocations with pa_deleted
> + * otherwise we could leave used blocks available for
> + * allocation in buddy when concurrent ext4_mb_put_pa()
> + * is dropping preallocation
> + */
> + list_for_each_rcu(cur, &grp->bb_prealloc_list) {
> + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
> + spin_lock(&pa->pa_lock);
> + ext4_get_group_no_and_offset(sb, pa->pa_pstart,
> + &groupnr, &start);
> + len = pa->pa_len;
> + spin_unlock(&pa->pa_lock);
> + if (unlikely(len == 0))
> + continue;
> + BUG_ON(groupnr != group);
> + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
> + bitmap, start, len);
> + preallocated += len;
> + count++;
> + }

Seems to be missing rcu_read_lock()

> + mb_debug("prellocated %u for group %lu\n", preallocated, group);
> +}
> +
> +static void ext4_mb_pa_callback(struct rcu_head *head)
> +{
> + struct ext4_prealloc_space *pa;
> + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
> + kmem_cache_free(ext4_pspace_cachep, pa);
> +}
> +#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext4_mb_pa_callback)

Is there any reason why this had to be implemented as a macro?

>
> ...
>
> +static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
> +{
> + struct super_block *sb = ac->ac_sb;
> + struct ext4_prealloc_space *pa;
> + struct ext4_group_info *grp;
> + struct ext4_inode_info *ei;
> +
> + /* preallocate only when found space is larger then requested */
> + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
> + BUG_ON(ac->ac_status != AC_STATUS_FOUND);
> + BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
> +
> + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);

Do all the GFP_NOFS's in this code really need to be GFP_NOFS?

> + if (pa == NULL)
> + return -ENOMEM;
> +
> + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
> + int winl;
> + int wins;
> + int win;
> + int offs;
> +
> + /* we can't allocate as much as normalizer wants.
> + * so, found space must get proper lstart
> + * to cover original request */
> + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
> + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
> +
> + /* we're limited by original request in that
> + * logical block must be covered any way
> + * winl is window we can move our chunk within */
> + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
> +
> + /* also, we should cover whole original request */
> + wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
> +
> + /* the smallest one defines real window */
> + win = min(winl, wins);
> +
> + offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
> + if (offs && offs < win)
> + win = offs;
> +
> + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
> + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
> + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
> + }
> +
> + /* preallocation can change ac_b_ex, thus we store actually
> + * allocated blocks for history */
> + ac->ac_f_ex = ac->ac_b_ex;
> +
> + pa->pa_lstart = ac->ac_b_ex.fe_logical;
> + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
> + pa->pa_len = ac->ac_b_ex.fe_len;
> + pa->pa_free = pa->pa_len;
> + atomic_set(&pa->pa_count, 1);
> + spin_lock_init(&pa->pa_lock);
> + pa->pa_deleted = 0;
> + pa->pa_linear = 0;
> +
> + mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
> + pa->pa_pstart, pa->pa_len, pa->pa_lstart);
> +
> + ext4_mb_use_inode_pa(ac, pa);
> + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
> +
> + ei = EXT4_I(ac->ac_inode);
> + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
> +
> + pa->pa_obj_lock = &ei->i_prealloc_lock;
> + pa->pa_inode = ac->ac_inode;
> +
> + ext4_lock_group(sb, ac->ac_b_ex.fe_group);
> + list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
> + ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
> +
> + spin_lock(pa->pa_obj_lock);
> + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
> + spin_unlock(pa->pa_obj_lock);

hm. Strange to see list_add_rcu() inside spinlock like this.

> + return 0;
> +}
> +
>
> ...
>
> +static int ext4_mb_discard_group_preallocations(struct super_block *sb,
> + ext4_group_t group, int needed)
> +{
> + struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> + struct buffer_head *bitmap_bh = NULL;
> + struct ext4_prealloc_space *pa, *tmp;
> + struct list_head list;
> + struct ext4_buddy e4b;
> + int err;
> + int busy = 0;
> + int free = 0;
> +
> + mb_debug("discard preallocation for group %lu\n", group);
> +
> + if (list_empty(&grp->bb_prealloc_list))
> + return 0;
> +
> + bitmap_bh = read_block_bitmap(sb, group);
> + if (bitmap_bh == NULL) {
> + /* error handling here */
> + ext4_mb_release_desc(&e4b);
> + BUG_ON(bitmap_bh == NULL);
> + }
> +
> + err = ext4_mb_load_buddy(sb, group, &e4b);
> + BUG_ON(err != 0); /* error handling here */
> +
> + if (needed == 0)
> + needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
> +
> + grp = ext4_get_group_info(sb, group);
> + INIT_LIST_HEAD(&list);
> +
> +repeat:
> + ext4_lock_group(sb, group);
> + list_for_each_entry_safe(pa, tmp,
> + &grp->bb_prealloc_list, pa_group_list) {
> + spin_lock(&pa->pa_lock);
> + if (atomic_read(&pa->pa_count)) {
> + spin_unlock(&pa->pa_lock);
> + busy = 1;
> + continue;
> + }
> + if (pa->pa_deleted) {
> + spin_unlock(&pa->pa_lock);
> + continue;
> + }
> +
> + /* seems this one can be freed ... */
> + pa->pa_deleted = 1;
> +
> + /* we can trust pa_free ... */
> + free += pa->pa_free;
> +
> + spin_unlock(&pa->pa_lock);
> +
> + list_del_rcu(&pa->pa_group_list);
> + list_add(&pa->u.pa_tmp_list, &list);
> + }

Strange to see rcu operations outside rcu_read_lock().

> + /* if we still need more blocks and some PAs were used, try again */
> + if (free < needed && busy) {
> + busy = 0;
> + ext4_unlock_group(sb, group);
> + /*
> + * Yield the CPU here so that we don't get soft lockup
> + * in non preempt case.
> + */
> + yield();

argh, no, yield() is basically unusable. schedule_timeout(1) is preferable.

Please test this code when there are lots of cpu-intensive tasks running.
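
What the suggestion amounts to, roughly (a sketch against the hunk quoted
above, not a tested replacement):

	if (free < needed && busy) {
		busy = 0;
		ext4_unlock_group(sb, group);
		/* sleep for at least one tick; unlike yield(), this
		 * reliably lets other runnable tasks, including the
		 * watchdog thread, get the CPU */
		schedule_timeout_uninterruptible(1);
		goto repeat;
	}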

> + goto repeat;
> + }
> +
> + /* found anything to free? */
> + if (list_empty(&list)) {
> + BUG_ON(free != 0);
> + goto out;
> + }
> +
> + /* now free all selected PAs */
> + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
> +
> + /* remove from object (inode or locality group) */
> + spin_lock(pa->pa_obj_lock);
> + list_del_rcu(&pa->pa_inode_list);
> + spin_unlock(pa->pa_obj_lock);
> +
> + if (pa->pa_linear)
> + ext4_mb_release_group_pa(&e4b, pa);
> + else
> + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
> +
> + list_del(&pa->u.pa_tmp_list);
> + mb_call_rcu(pa);
> + }
> +
> +out:
> + ext4_unlock_group(sb, group);
> + ext4_mb_release_desc(&e4b);
> + brelse(bitmap_bh);

put_bh()

> + return free;
> +}
> +
> +/*
> + * releases all non-used preallocated blocks for given inode
> + *
> + * It's important to discard preallocations under i_data_sem
> + * We don't want another block to be served from the prealloc
> + * space when we are discarding the inode prealloc space.
> + *
> + * FIXME!! Make sure it is valid at all the call sites
> + */
> +void ext4_mb_discard_inode_preallocations(struct inode *inode)
> +{
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + struct super_block *sb = inode->i_sb;
> + struct buffer_head *bitmap_bh = NULL;
> + struct ext4_prealloc_space *pa, *tmp;
> + ext4_group_t group = 0;
> + struct list_head list;
> + struct ext4_buddy e4b;
> + int err;
> +
> + if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
> + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
> + return;
> + }
> +
> + mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
> +
> + INIT_LIST_HEAD(&list);
> +
> +repeat:
> + /* first, collect all pa's in the inode */
> + spin_lock(&ei->i_prealloc_lock);
> + while (!list_empty(&ei->i_prealloc_list)) {
> + pa = list_entry(ei->i_prealloc_list.next,
> + struct ext4_prealloc_space, pa_inode_list);
> + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
> + spin_lock(&pa->pa_lock);
> + if (atomic_read(&pa->pa_count)) {
> + /* this shouldn't happen often - nobody should
> + * use preallocation while we're discarding it */
> + spin_unlock(&pa->pa_lock);
> + spin_unlock(&ei->i_prealloc_lock);
> + printk(KERN_ERR "uh-oh! used pa while discarding\n");
> + dump_stack();

WARN_ON(1) would be more conventional.

> + current->state = TASK_UNINTERRUPTIBLE;
> + schedule_timeout(HZ);

schedule_timeout_uninterruptible()

> + goto repeat;
> +
> + }
> + if (pa->pa_deleted == 0) {
> + pa->pa_deleted = 1;
> + spin_unlock(&pa->pa_lock);
> + list_del_rcu(&pa->pa_inode_list);
> + list_add(&pa->u.pa_tmp_list, &list);
> + continue;
> + }
> +
> + /* someone is deleting pa right now */
> + spin_unlock(&pa->pa_lock);
> + spin_unlock(&ei->i_prealloc_lock);
> +
> + /* we have to wait here because pa_deleted
> + * doesn't mean pa is already unlinked from
> + * the list. as we might be called from
> + * ->clear_inode() the inode will get freed
> + * and concurrent thread which is unlinking
> + * pa from inode's list may access already
> + * freed memory, bad-bad-bad */
> +
> + /* XXX: if this happens too often, we can
> + * add a flag to force wait only in case
> + * of ->clear_inode(), but not in case of
> + * regular truncate */
> + current->state = TASK_UNINTERRUPTIBLE;
> + schedule_timeout(HZ);

ditto

> + goto repeat;
> + }
> + spin_unlock(&ei->i_prealloc_lock);
> +
> + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
> + BUG_ON(pa->pa_linear != 0);
> + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
> +
> + err = ext4_mb_load_buddy(sb, group, &e4b);
> + BUG_ON(err != 0); /* error handling here */
> +
> + bitmap_bh = read_block_bitmap(sb, group);
> + if (bitmap_bh == NULL) {
> + /* error handling here */
> + ext4_mb_release_desc(&e4b);
> + BUG_ON(bitmap_bh == NULL);
> + }
> +
> + ext4_lock_group(sb, group);
> + list_del_rcu(&pa->pa_group_list);
> + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
> + ext4_unlock_group(sb, group);
> +
> + ext4_mb_release_desc(&e4b);
> + brelse(bitmap_bh);
> +
> + list_del(&pa->u.pa_tmp_list);
> + mb_call_rcu(pa);
> + }
> +}

Would be nice to ask Paul to review all the rcu usage in here. It looks odd.

>
> ...
>
> +#else
> +#define ext4_mb_show_ac(x)
> +#endif

static inlined C functions are preferred (+1e6 dittoes)

> +/*
> + * We use locality group preallocation for small size file. The size of the
> + * file is determined by the current size or the resulting size after
> + * allocation which ever is larger
> + *
> + * One can tune this size via /proc/fs/ext4/<partition>/stream_req
> + */
> +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> + int bsbits = ac->ac_sb->s_blocksize_bits;
> + loff_t size, isize;
> +
> + if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
> + return;
> +
> + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
> + isize = i_size_read(ac->ac_inode) >> bsbits;
> + if (size < isize)
> + size = isize;

min()?

> + /* don't use group allocation for large files */
> + if (size >= sbi->s_mb_stream_request)
> + return;
> +
> + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> + return;
> +
> + BUG_ON(ac->ac_lg != NULL);
> + ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
> + put_cpu();

Strange-looking code. I'd be interested in a description of the per-cpu
design here.

> + /* we're going to use group allocation */
> + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
> +
> + /* serialize all allocations in the group */
> + down(&ac->ac_lg->lg_sem);

This should be a mutex, shouldn't it?

> +}
> +
>
> ...
>
> +static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
> + ext4_group_t group, ext4_grpblk_t block, int count)
> +{
> + struct ext4_group_info *db = e4b->bd_info;
> + struct super_block *sb = e4b->bd_sb;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_free_metadata *md;
> + int i;
> +
> + BUG_ON(e4b->bd_bitmap_page == NULL);
> + BUG_ON(e4b->bd_buddy_page == NULL);
> +
> + ext4_lock_group(sb, group);
> + for (i = 0; i < count; i++) {
> + md = db->bb_md_cur;
> + if (md && db->bb_tid != handle->h_transaction->t_tid) {
> + db->bb_md_cur = NULL;
> + md = NULL;
> + }
> +
> + if (md == NULL) {
> + ext4_unlock_group(sb, group);
> + md = kmalloc(sizeof(*md), GFP_KERNEL);

Why was this one not GFP_NOFS?

> + if (md == NULL)
> + return -ENOMEM;

Did we just leak some memory?

> + md->num = 0;
> + md->group = group;
> +
> + ext4_lock_group(sb, group);
> + if (db->bb_md_cur == NULL) {
> + spin_lock(&sbi->s_md_lock);
> + list_add(&md->list, &sbi->s_active_transaction);
> + spin_unlock(&sbi->s_md_lock);
> + /* protect buddy cache from being freed,
> + * otherwise we'll refresh it from
> + * on-disk bitmap and lose not-yet-available
> + * blocks */
> + page_cache_get(e4b->bd_buddy_page);
> + page_cache_get(e4b->bd_bitmap_page);
> + db->bb_md_cur = md;
> + db->bb_tid = handle->h_transaction->t_tid;
> + mb_debug("new md 0x%p for group %lu\n",
> + md, md->group);
> + } else {
> + kfree(md);
> + md = db->bb_md_cur;
> + }
> + }
> +
> + BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
> + md->blocks[md->num] = block + i;
> + md->num++;
> + if (md->num == EXT4_BB_MAX_BLOCKS) {
> + /* no more space, put full container on a sb's list */
> + db->bb_md_cur = NULL;
> + }
> + }
> + ext4_unlock_group(sb, group);
> + return 0;
> +}
> +
>
> ...
>
> + case Opt_mballoc:
> + set_opt(sbi->s_mount_opt, MBALLOC);
> + break;
> + case Opt_nomballoc:
> + clear_opt(sbi->s_mount_opt, MBALLOC);
> + break;
> + case Opt_stripe:
> + if (match_int(&args[0], &option))
> + return 0;
> + if (option < 0)
> + return 0;
> + sbi->s_stripe = option;
> + break;

These appear to be undocumented.

> default:
> printk (KERN_ERR
> "EXT4-fs: Unrecognized mount option \"%s\" "
> @@ -1742,6 +1762,33 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
> return (has_super + ext4_group_first_block_no(sb, bg));
> }
>
> +/**
> + * ext4_get_stripe_size: Get the stripe size.
> + * @sbi: In memory super block info
> + *
> + * If we have specified it via mount option, then
> + * use the mount option value. If the value specified at mount time is
> + * greater than the blocks per group use the super block value.
> + * If the super block value is greater than blocks per group return 0.
> + * Allocator needs it be less than blocks per group.
> + *
> + */
> +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
> +{
> + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
> + unsigned long stripe_width =
> + le32_to_cpu(sbi->s_es->s_raid_stripe_width);
> +
> + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
> + return sbi->s_stripe;
> + } else if (stripe_width <= sbi->s_blocks_per_group) {
> + return stripe_width;
> + } else if (stride <= sbi->s_blocks_per_group) {
> + return stride;
> + }

unneeded braces.

> + return 0;
> +}
>
> ...
>
> +static inline
> +struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
> + ext4_group_t group)
> +{
> + struct ext4_group_info ***grp_info;
> + long indexv, indexh;
> + grp_info = EXT4_SB(sb)->s_group_info;
> + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
> + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
> + return grp_info[indexv][indexh];
> +}

This should be uninlined.



Gosh what a lot of code. Is it faster?


2008-01-23 23:21:19

by Andreas Dilger

Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

On Jan 23, 2008 14:07 -0800, Andrew Morton wrote:
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{ \
> > + bit += ((unsigned long) addr & 3UL) << 3; \
> > + addr = (void *) ((unsigned long) addr & ~3UL); \
> > +}
>
> Why do these exist?

They seem to be a holdover from when mballoc stored the buddy bitmaps
on disk. That no longer happens (to avoid bitmap vs. buddy consistency
problems), so I suspect they can be removed.

I can't comment on many of the other issues because Alex wrote most
of the code.

> Gosh what a lot of code. Is it faster?

Yes, and also importantly it uses a lot less CPU to do a given amount
of allocation, which is critical in our environments where there is
very high disk bandwidth on a single node and CPU becomes the limiting
factor of the IO speed. This of course also helps any write-intensive
environment where the CPU is doing something "useful".

Some older test results include:
https://ols2006.108.redhat.com/2007/Reprints/mathur-Reprint.pdf (Section 7)

In the linux-ext4 thread "compilebench numbers for ext4":
http://www.mail-archive.com/[email protected]/msg03834.html

http://oss.oracle.com/~mason/compilebench/ext4/ext-create-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-compile-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-read-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-rm-compare.png

Note that the ext-read-compare.png graph shows lower read performance, but a
couple of bugs in mballoc have since been fixed so that ext4 allocates more
contiguous extents.

In the old linux-ext4 thread "[RFC] delayed allocation testing on node zefir"
http://www.mail-archive.com/[email protected]/msg00587.html

: dd2048rw
:          REAL   UTIME  STIME  READ    WRITTEN  DETAILS
EXT3     : 58.46  23     1491   2572    2097292  17 extents
EXT4     : 44.56  19     1018   12      2097244  19 extents
REISERFS : 56.80  26     1370   2952    2097336  457 extents
JFS      : 45.77  22     984    0       2097216  1 extents
XFS      : 50.97  20     1394   0       2100825  7 extents

: kernuntar
:          REAL   UTIME  STIME  READ    WRITTEN  DETAILS
EXT3     : 56.99  5037   651    68      252016
EXT4     : 55.03  5034   553    36      249884
REISERFS : 52.55  4996   854    64      238068
JFS      : 70.15  5057   630    496     288116
XFS      : 72.84  5052   953    132     316798

: kernstat
:          REAL   UTIME  STIME  READ    WRITTEN  DETAILS
EXT3     : 2.83   8      15     5892    0
EXT4     : 0.51   9      10     5892    0
REISERFS : 0.81   7      49     2696    0
JFS      : 6.19   11     49     12552   0
XFS      : 2.09   9      61     6504    0

: kerncat
:          REAL   UTIME  STIME  READ    WRITTEN  DETAILS
EXT3     : 9.48   25     213    241624  0
EXT4     : 6.29   27     197    238560  0
REISERFS : 14.69  33     230    234744  0
JFS      : 23.51  23     231    244596  0
XFS      : 18.24  36     254    238548  0

: kernrm
:          REAL   UTIME  STIME  READ    WRITTEN  DETAILS
EXT3     : 4.82   4      108    9628    4672
EXT4     : 1.61   5      110    6536    4632
REISERFS : 3.15   8      276    2768    236
JFS      : 33.90  7      168    14400   33048
XFS      : 20.03  8      296    6632    86160


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-01-24 07:57:49

by Aneesh Kumar K.V

Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

On Wed, Jan 23, 2008 at 02:07:27PM -0800, Andrew Morton wrote:
> > On Mon, 21 Jan 2008 22:02:20 -0500 "Theodore Ts'o" <[email protected]> wrote:
> > From: Alex Tomas <[email protected]>
> >
> > Signed-off-by: Alex Tomas <[email protected]>
> > Signed-off-by: Andreas Dilger <[email protected]>
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > Signed-off-by: Eric Sandeen <[email protected]>
> > Signed-off-by: "Theodore Ts'o" <[email protected]>
> >
> > ...
> >
> > +#if BITS_PER_LONG == 64
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{ \
> > + bit += ((unsigned long) addr & 7UL) << 3; \
> > + addr = (void *) ((unsigned long) addr & ~7UL); \
> > +}
> > +#elif BITS_PER_LONG == 32
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{ \
> > + bit += ((unsigned long) addr & 3UL) << 3; \
> > + addr = (void *) ((unsigned long) addr & ~3UL); \
> > +}
> > +#else
> > +#error "how many bits you are?!"
> > +#endif
>
> Why do these exist?

The initial version of mballoc supported only 32-bit x86; this was there to
give a compile warning on 64-bit platforms. I guess we can remove it now, or
maybe we can keep it as-is because it is harmless.


>
> > +static inline int mb_test_bit(int bit, void *addr)
> > +{
> > + mb_correct_addr_and_bit(bit, addr);
> > + return ext4_test_bit(bit, addr);
> > +}
>
> ext2_test_bit() already handles bitnum > wordsize.
>
> If mb_correct_addr_and_bit() is actually needed then some suitable comment
> would help.

ext4_test_bit() on powerpc needs the addr to be 8-byte aligned. Otherwise
it fails.
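
To spell out what the macro buys (an illustrative sketch of the 64-bit case,
not the patch's code):

	/* fold the byte misalignment of 'addr' into 'bit' so the
	 * underlying bitop always sees an unsigned-long-aligned address,
	 * which is what the powerpc *_bit() helpers expect */
	static inline int mb_test_bit_sketch(int bit, void *addr)
	{
		bit  += ((unsigned long)addr & 7UL) << 3;  /* 8 bits per byte */
		addr  = (void *)((unsigned long)addr & ~7UL);
		return ext4_test_bit(bit, addr);
	}

	/* e.g. (addr = base + 3, bit = 4) becomes (addr = base, bit = 28)
	 * when 'base' itself is 8-byte aligned */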

>
> > +static inline void mb_set_bit(int bit, void *addr)
> > +{
> > + mb_correct_addr_and_bit(bit, addr);
> > + ext4_set_bit(bit, addr);
> > +}
> > +
> > +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
> > +{
> > + mb_correct_addr_and_bit(bit, addr);
> > + ext4_set_bit_atomic(lock, bit, addr);
> > +}
> > +
> > +static inline void mb_clear_bit(int bit, void *addr)
> > +{
> > + mb_correct_addr_and_bit(bit, addr);
> > + ext4_clear_bit(bit, addr);
> > +}
> > +
> > +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
> > +{
> > + mb_correct_addr_and_bit(bit, addr);
> > + ext4_clear_bit_atomic(lock, bit, addr);
> > +}
> > +
> > +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
>
> uninlining this will save about eighty squigabytes of text.

Fixed


>
> Please review all of ext4/jbd2 with a view to removing unnecessary and wrong
> inlinings.
>
> > +{
> > + char *bb;
> > +
> > + /* FIXME!! is this needed */
> > + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
> > + BUG_ON(max == NULL);
> > +
> > + if (order > e4b->bd_blkbits + 1) {
> > + *max = 0;
> > + return NULL;
> > + }
> > +
> > + /* at order 0 we see each particular block */
> > + *max = 1 << (e4b->bd_blkbits + 3);
> > + if (order == 0)
> > + return EXT4_MB_BITMAP(e4b);
> > +
> > + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
> > + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
> > +
> > + return bb;
> > +}
> > +
> >
> > ...
> >
> > +#else
> > +#define mb_free_blocks_double(a, b, c, d)
> > +#define mb_mark_used_double(a, b, c)
> > +#define mb_cmp_bitmaps(a, b)
> > +#endif
>
> Please use the do{}while(0) thing. Or, better, proper C functions which
> have typechecking (unless this will cause undefined-var compile errors,
> which happens sometimes)

Made static inline void.

>
> > +/* find most significant bit */
> > +static int fmsb(unsigned short word)
> > +{
> > + int order;
> > +
> > + if (word > 255) {
> > + order = 7;
> > + word >>= 8;
> > + } else {
> > + order = -1;
> > + }
> > +
> > + do {
> > + order++;
> > + word >>= 1;
> > + } while (word != 0);
> > +
> > + return order;
> > +}
>
> Did we just reinvent fls()?

Replaced by fls().

>
> > +/* FIXME!! need more doc */
> > +static void ext4_mb_mark_free_simple(struct super_block *sb,
> > + void *buddy, unsigned first, int len,
> > + struct ext4_group_info *grp)
> > +{
> > + struct ext4_sb_info *sbi = EXT4_SB(sb);
> > + unsigned short min;
> > + unsigned short max;
> > + unsigned short chunk;
> > + unsigned short border;
> > +
> > + BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
> > +
> > + border = 2 << sb->s_blocksize_bits;
>
> Won't this explode with >= 32k blocksize?
>
> > + while (len > 0) {
> > + /* find how many blocks can be covered since this position */
> > + max = ffs(first | border) - 1;
> > +
> > + /* find how many blocks of power 2 we need to mark */
> > + min = fmsb(len);
> > +
> > + if (max < min)
> > + min = max;
> > + chunk = 1 << min;
> > +
> > + /* mark multiblock chunks only */
> > + grp->bb_counters[min]++;
> > + if (min > 0)
> > + mb_clear_bit(first >> min,
> > + buddy + sbi->s_mb_offsets[min]);
> > +
> > + len -= chunk;
> > + first += chunk;
> > + }
> > +}
> > +
> >
> > ...
> >
> > +static int ext4_mb_init_cache(struct page *page, char *incore)
> > +{
> > + int blocksize;
> > + int blocks_per_page;
> > + int groups_per_page;
> > + int err = 0;
> > + int i;
> > + ext4_group_t first_group;
> > + int first_block;
> > + struct super_block *sb;
> > + struct buffer_head *bhs;
> > + struct buffer_head **bh;
> > + struct inode *inode;
> > + char *data;
> > + char *bitmap;
> > +
> > + mb_debug("init page %lu\n", page->index);
> > +
> > + inode = page->mapping->host;
> > + sb = inode->i_sb;
> > + blocksize = 1 << inode->i_blkbits;
> > + blocks_per_page = PAGE_CACHE_SIZE / blocksize;
> > +
> > + groups_per_page = blocks_per_page >> 1;
> > + if (groups_per_page == 0)
> > + groups_per_page = 1;
> > +
> > + /* allocate buffer_heads to read bitmaps */
> > + if (groups_per_page > 1) {
> > + err = -ENOMEM;
> > + i = sizeof(struct buffer_head *) * groups_per_page;
> > + bh = kmalloc(i, GFP_NOFS);
> > + if (bh == NULL)
> > + goto out;
> > + memset(bh, 0, i);
>
> kzalloc()

Fixed

>
> > + } else
> > + bh = &bhs;
> > +
> > + first_group = page->index * blocks_per_page / 2;
> > +
> > + /* read all groups the page covers into the cache */
> > + for (i = 0; i < groups_per_page; i++) {
> > + struct ext4_group_desc *desc;
> > +
> > + if (first_group + i >= EXT4_SB(sb)->s_groups_count)
> > + break;
> > +
> > + err = -EIO;
> > + desc = ext4_get_group_desc(sb, first_group + i, NULL);
> > + if (desc == NULL)
> > + goto out;
> > +
> > + err = -ENOMEM;
> > + bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
> > + if (bh[i] == NULL)
> > + goto out;
> > +
> > + if (buffer_uptodate(bh[i]))
> > + continue;
> > +
> > + lock_buffer(bh[i]);
> > + if (buffer_uptodate(bh[i])) {
> > + unlock_buffer(bh[i]);
> > + continue;
> > + }
>
> Didn't we just add a helper in fs/buffer.c to do this?
>

Fixed


> > + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> > + ext4_init_block_bitmap(sb, bh[i],
> > + first_group + i, desc);
> > + set_buffer_uptodate(bh[i]);
> > + unlock_buffer(bh[i]);
> > + continue;
> > + }
> > + get_bh(bh[i]);
> > + bh[i]->b_end_io = end_buffer_read_sync;

[... snip... ]
> > +
> > + /* set incore so that the buddy information can be
> > + * generated using this
> > + */
> > + incore = data;
> > + }
> > + }
> > + SetPageUptodate(page);
>
> Is the page locked here?


The page is locked via find_or_create_page().

>
> > +out:
> > + if (bh) {
> > + for (i = 0; i < groups_per_page && bh[i]; i++)
> > + brelse(bh[i]);
>
> put_bh()
>
> > + if (bh != &bhs)
> > + kfree(bh);
> > + }
> > + return err;
> > +}
> > +
> >
> > ...
> >
> > +static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
> > +{
> > + __u32 *addr;
> > +
> > + len = cur + len;
> > + while (cur < len) {
> > + if ((cur & 31) == 0 && (len - cur) >= 32) {
> > + /* fast path: clear whole word at once */
>
> s/clear/set/

Fixed

>
> > + addr = bm + (cur >> 3);
> > + *addr = 0xffffffff;
> > + cur += 32;
> > + continue;
> > + }
> > + mb_set_bit_atomic(lock, cur, bm);
> > + cur++;
> > + }
> > +}
> > +
> >
> > ...
> >
> > +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> > + ext4_group_t group)
> > +{
> > + struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> > + struct ext4_prealloc_space *pa;
> > + struct list_head *cur;
> > + ext4_group_t groupnr;
> > + ext4_grpblk_t start;
> > + int preallocated = 0;
> > + int count = 0;
> > + int len;
> > +
> > + /* all form of preallocation discards first load group,
> > + * so the only competing code is preallocation use.
> > + * we don't need any locking here
> > + * notice we do NOT ignore preallocations with pa_deleted
> > + * otherwise we could leave used blocks available for
> > + * allocation in buddy when concurrent ext4_mb_put_pa()
> > + * is dropping preallocation
> > + */
> > + list_for_each_rcu(cur, &grp->bb_prealloc_list) {
> > + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
> > + spin_lock(&pa->pa_lock);
> > + ext4_get_group_no_and_offset(sb, pa->pa_pstart,
> > + &groupnr, &start);
> > + len = pa->pa_len;
> > + spin_unlock(&pa->pa_lock);
> > + if (unlikely(len == 0))
> > + continue;
> > + BUG_ON(groupnr != group);
> > + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
> > + bitmap, start, len);
> > + preallocated += len;
> > + count++;
> > + }
>
> Seems to be missing rcu_read_lock()
>


bb_prealloc_list is actually modified under ext4_group_lock, so it is not
actually RCU. I think we should be using list_for_each() there. The
RCU-managed lists are i_prealloc_list and lg_prealloc_list.
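
A sketch of the distinction (illustrative fragments only; use() is a
placeholder):

	/* if the list really were RCU-managed, the lockless reader
	 * would have to look like this: */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &grp->bb_prealloc_list, pa_group_list)
		use(pa);
	rcu_read_unlock();

	/* but bb_prealloc_list is only modified under ext4_lock_group(),
	 * and ext4_mb_generate_from_pa() already runs under that lock,
	 * so a plain traversal is the honest spelling (which is what the
	 * follow-up diff below switches to): */
	list_for_each_entry(pa, &grp->bb_prealloc_list, pa_group_list)
		use(pa);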


> > + mb_debug("prellocated %u for group %lu\n", preallocated, group);
> > +}
> > +
> > +static void ext4_mb_pa_callback(struct rcu_head *head)
> > +{
> > + struct ext4_prealloc_space *pa;
> > + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
> > + kmem_cache_free(ext4_pspace_cachep, pa);
> > +}
> > +#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext4_mb_pa_callback)
>
> Is there any reason why this had to be implemented as a macro?

Fixed

>
> >
> > ...
> >
> > +static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
> > +{
> > + struct super_block *sb = ac->ac_sb;
> > + struct ext4_prealloc_space *pa;
> > + struct ext4_group_info *grp;
> > + struct ext4_inode_info *ei;
> > +
> > + /* preallocate only when found space is larger then requested */
> > + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
> > + BUG_ON(ac->ac_status != AC_STATUS_FOUND);
> > + BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
> > +
> > + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
>
> Do all the GFP_NOFS's in this code really need to be GFP_NOFS?
>
> > + if (pa == NULL)
> > + return -ENOMEM;
> > + ext4_lock_group(sb, ac->ac_b_ex.fe_group);

....

> > + list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
> > + ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
> > +
> > + spin_lock(pa->pa_obj_lock);
> > + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
> > + spin_unlock(pa->pa_obj_lock);
>
> hm. Strange to see list_add_rcu() inside spinlock like this.

A few lines above we have
pa->pa_obj_lock = &ei->i_prealloc_lock;
so the spin_lock is there to prevent multiple CPUs from adding to the
prealloc list at the same time.
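
That is the usual RCU list pattern: writers still serialize against each
other with a lock, and only readers go lockless. In sketch form (inspect()
is a placeholder):

	/* writer side: the spinlock orders concurrent adders */
	spin_lock(&ei->i_prealloc_lock);
	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
	spin_unlock(&ei->i_prealloc_lock);

	/* reader side: no lock, just an RCU read-side section */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list)
		inspect(pa);
	rcu_read_unlock();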


>
> > + return 0;
> > +}
> > +
> >
> > ...
> >
> > +static int ext4_mb_discard_group_preallocations(struct super_block *sb,
> > + ext4_group_t group, int needed)
> > +{
> > + struct ext4_group_info *grp = ext4_get_group_info(sb, group);
> > + struct buffer_head *bitmap_bh = NULL;
> > + struct ext4_prealloc_space *pa, *tmp;
> > + struct list_head list;
> > + struct ext4_buddy e4b;
> > + int err;
> > + int busy = 0;
> > + int free = 0;
> > + /* seems this one can be freed ... */

....

> > + pa->pa_deleted = 1;
> > +
> > + /* we can trust pa_free ... */
> > + free += pa->pa_free;
> > +
> > + spin_unlock(&pa->pa_lock);
> > +
> > + list_del_rcu(&pa->pa_group_list);
> > + list_add(&pa->u.pa_tmp_list, &list);
> > + }
>
> Strange to see rcu operations outside rcu_read_lock().

That doesn't actually need to be list_del_rcu(). As I stated above, that is
the bb_prealloc_list, which is updated under ext4_group_lock.


>
> > + /* if we still need more blocks and some PAs were used, try again */
> > + if (free < needed && busy) {
> > + busy = 0;
> > + ext4_unlock_group(sb, group);
> > + /*
> > + * Yield the CPU here so that we don't get soft lockup
> > + * in non preempt case.
> > + */
> > + yield();
>
> argh, no, yield() is basically unusable. schedule_timeout(1) is preferable.

I actually used schedule_timeout(HZ); this was actually a bug fix for a soft
lockup happening when we were running a non-preemptible kernel. We just want
to make sure the high-priority watchdog thread gets a chance to run, and if
there are no higher-priority threads we would like to keep running ourselves.
My understanding was that yield() is the right choice there.


>
> Please test this code when there are lots of cpu-intensive tasks running.
>
> > + goto repeat;
> > + }
> > +
> > + /* found anything to free? */
> > + if (list_empty(&list)) {
> > + BUG_ON(free != 0);
> > + goto out;
> > + }
> > +
> > + /* now free all selected PAs */
> > + if (atomic_read(&pa->pa_count)) {
> > + /* this shouldn't happen often - nobody should

.....


> > + * use preallocation while we're discarding it */
> > + spin_unlock(&pa->pa_lock);
> > + spin_unlock(&ei->i_prealloc_lock);
> > + printk(KERN_ERR "uh-oh! used pa while discarding\n");
> > + dump_stack();
>
> WARN_ON(1) would be more conventional.

Fixed

>
> > + current->state = TASK_UNINTERRUPTIBLE;
> > + schedule_timeout(HZ);
>
> schedule_timeout_uninterruptible()
>

Fixed

> > + goto repeat;
> > +
> > + }
> > + if (pa->pa_deleted == 0) {
> > + pa->pa_deleted = 1;
> > + spin_unlock(&pa->pa_lock);
> > + list_del_rcu(&pa->pa_inode_list);
> > + list_add(&pa->u.pa_tmp_list, &list);
> > + continue;
> > + }
> > +
> > + /* someone is deleting pa right now */
> > + spin_unlock(&pa->pa_lock);
> > + spin_unlock(&ei->i_prealloc_lock);
> > +
> > + /* we have to wait here because pa_deleted
> > + * doesn't mean pa is already unlinked from
> > + * the list. as we might be called from
> > + * ->clear_inode() the inode will get freed
> > + * and concurrent thread which is unlinking
> > + * pa from inode's list may access already
> > + * freed memory, bad-bad-bad */
> > +
> > + /* XXX: if this happens too often, we can
> > + * add a flag to force wait only in case
> > + * of ->clear_inode(), but not in case of
> > + * regular truncate */
> > + current->state = TASK_UNINTERRUPTIBLE;
> > + schedule_timeout(HZ);
>
> ditto
>

Fixed

> > + goto repeat;
> > + }
> > + spin_unlock(&ei->i_prealloc_lock);
> > +
> > + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
> > + BUG_ON(pa->pa_linear != 0);
> > + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
> > +
> > + err = ext4_mb_load_buddy(sb, group, &e4b);
> > + BUG_ON(err != 0); /* error handling here */
> > +
> > + bitmap_bh = read_block_bitmap(sb, group);
> > + if (bitmap_bh == NULL) {
> > + /* error handling here */
> > + ext4_mb_release_desc(&e4b);
> > + BUG_ON(bitmap_bh == NULL);
> > + }
> > +
> > + ext4_lock_group(sb, group);
> > + list_del_rcu(&pa->pa_group_list);
> > + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
> > + ext4_unlock_group(sb, group);
> > +
> > + ext4_mb_release_desc(&e4b);
> > + brelse(bitmap_bh);
> > +
> > + list_del(&pa->u.pa_tmp_list);
> > + mb_call_rcu(pa);
> > + }
> > +}
>
> Would be nice to ask Paul to review all the rcu usage in here. It looks odd.
>

Will add Paul to the CC

> >
> > ...
> >
> > +#else
> > +#define ext4_mb_show_ac(x)
> > +#endif
>
> static inlined C functions are preferred (+1e6 dittoes)

Fixed

>
> > +/*
> > + * We use locality group preallocation for small size file. The size of the
> > + * file is determined by the current size or the resulting size after
> > + * allocation which ever is larger
> > + *
> > + * One can tune this size via /proc/fs/ext4/<partition>/stream_req
> > + */
> > +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
> > +{
> > + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> > + int bsbits = ac->ac_sb->s_blocksize_bits;
> > + loff_t size, isize;
> > +
> > + if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
> > + return;
> > +
> > + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
> > + isize = i_size_read(ac->ac_inode) >> bsbits;
> > + if (size < isize)
> > + size = isize;
>
> min()?
>

Updated as size = max(size, isize);


> > + /* don't use group allocation for large files */
> > + if (size >= sbi->s_mb_stream_request)
> > + return;
> > +
> > + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
> > + return;
> > +
> > + BUG_ON(ac->ac_lg != NULL);
> > + ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
> > + put_cpu();
>
> Strange-looking code. I'd be interested in a description of the per-cpu
> design here.

I added the doc below:


/*
* locality group prealloc space are per cpu. The reason for
* having per cpu locality group is to reduce the contention
* between block request from multiple CPUs.
*/
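
Why the get_cpu()/put_cpu() pair can be dropped so early, in sketch form:

	/* get_cpu() disables preemption so smp_processor_id() is stable
	 * while we pick a locality group; the CPU number is only a
	 * contention-avoidance hint, so preemption can be re-enabled
	 * immediately and the real serialization is left to lg_sem */
	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
	put_cpu();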




>
> > + /* we're going to use group allocation */
> > + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
> > +
> > + /* serialize all allocations in the group */
> > + down(&ac->ac_lg->lg_sem);
>
> This should be a mutex, shouldn't it?
>

Converted to a mutex.
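
In sketch form the conversion maps like this (the follow-up diff shows the
first two hunks; the lock/unlock sites, whose up() counterpart is not quoted
in this thread, follow the same pattern):

	struct semaphore lg_sem;     ->  struct mutex lg_sem;
	sema_init(&lg->lg_sem, 1);   ->  mutex_init(&lg->lg_sem);
	down(&ac->ac_lg->lg_sem);    ->  mutex_lock(&ac->ac_lg->lg_sem);
	up(&ac->ac_lg->lg_sem);      ->  mutex_unlock(&ac->ac_lg->lg_sem);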


> > +}
> > +
> >
> > ...
> >
> > +static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
> > + ext4_group_t group, ext4_grpblk_t block, int count)
> > +{
> > + struct ext4_group_info *db = e4b->bd_info;
> > + struct super_block *sb = e4b->bd_sb;
> > + struct ext4_sb_info *sbi = EXT4_SB(sb);
> > + struct ext4_free_metadata *md;
> > + int i;
> > +
> > + BUG_ON(e4b->bd_bitmap_page == NULL);
> > + BUG_ON(e4b->bd_buddy_page == NULL);
> > +
> > + ext4_lock_group(sb, group);
> > + for (i = 0; i < count; i++) {
> > + md = db->bb_md_cur;
> > + if (md && db->bb_tid != handle->h_transaction->t_tid) {
> > + db->bb_md_cur = NULL;
> > + md = NULL;
> > + }
> > +
> > + if (md == NULL) {
> > + ext4_unlock_group(sb, group);
> > + md = kmalloc(sizeof(*md), GFP_KERNEL);
>
> Why was this one not GFP_NOFS?
>
> > + if (md == NULL)
> > + return -ENOMEM;
>
> Did we just leak some memory?
>

No, the data is allocated to carry information regarding the free blocks.


> > + md->num = 0;
> > + md->group = group;
> > +
> > + ext4_lock_group(sb, group);
> > + if (db->bb_md_cur == NULL) {
> > + spin_lock(&sbi->s_md_lock);
> > + list_add(&md->list, &sbi->s_active_transaction);
> > + spin_unlock(&sbi->s_md_lock);
> > + /* protect buddy cache from being freed,
> > + * otherwise we'll refresh it from
> > + * on-disk bitmap and lose not-yet-available
> > + * blocks */
> > + page_cache_get(e4b->bd_buddy_page);
> > + page_cache_get(e4b->bd_bitmap_page);
> > + db->bb_md_cur = md;
> > + db->bb_tid = handle->h_transaction->t_tid;
> > + mb_debug("new md 0x%p for group %lu\n",
> > + md, md->group);
> > + } else {
> > + kfree(md);
> > + md = db->bb_md_cur;
> > + }
> > + }
> > +
> > + BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
> > + md->blocks[md->num] = block + i;
> > + md->num++;
> > + if (md->num == EXT4_BB_MAX_BLOCKS) {
> > + /* no more space, put full container on a sb's list */
> > + db->bb_md_cur = NULL;
> > + }
> > + }
> > + ext4_unlock_group(sb, group);
> > + return 0;
> > +}
> > +
> >
> > ...
> >
> > + case Opt_mballoc:
> > + set_opt(sbi->s_mount_opt, MBALLOC);
> > + break;
> > + case Opt_nomballoc:
> > + clear_opt(sbi->s_mount_opt, MBALLOC);
> > + break;
> > + case Opt_stripe:
> > + if (match_int(&args[0], &option))
> > + return 0;
> > + if (option < 0)
> > + return 0;
> > + sbi->s_stripe = option;
> > + break;
>
> These appear to be undocumented.

Updated


>
> > default:
> > printk (KERN_ERR
> > "EXT4-fs: Unrecognized mount option \"%s\" "
> > @@ -1742,6 +1762,33 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
> > return (has_super + ext4_group_first_block_no(sb, bg));
> > }
> >
> > +/**
> > + * ext4_get_stripe_size: Get the stripe size.
> > + * @sbi: In memory super block info
> > + *
> > + * If we have specified it via mount option, then
> > + * use the mount option value. If the value specified at mount time is
> > + * greater than the blocks per group use the super block value.
> > + * If the super block value is greater than blocks per group return 0.
> > + * Allocator needs it be less than blocks per group.
> > + *
> > + */
> > +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
> > +{
> > + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
> > + unsigned long stripe_width =
> > + le32_to_cpu(sbi->s_es->s_raid_stripe_width);
> > +
> > + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
> > + return sbi->s_stripe;
> > + } else if (stripe_width <= sbi->s_blocks_per_group) {
> > + return stripe_width;
> > + } else if (stride <= sbi->s_blocks_per_group) {
> > + return stride;
> > + }
>
> unneeded braces.

I was thinking it is OK these days: checkpatch didn't warn, and I had
multiple else-ifs. I could remove those else-ifs.


>
> > + return 0;
> > +}
> >
> > ...
> >
> > +static inline
> > +struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
> > + ext4_group_t group)
> > +{
> > + struct ext4_group_info ***grp_info;
> > + long indexv, indexh;
> > + grp_info = EXT4_SB(sb)->s_group_info;
> > + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
> > + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
> > + return grp_info[indexv][indexh];
> > +}
>
> This should be uninlined.
>
>
>
> Gosh what a lot of code. Is it faster?

Performance numbers with compilebench: http://ext4.wiki.kernel.org/index.php/Performance_results

2008-01-24 09:04:30

by Aneesh Kumar K.V

Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

Updated patch. Waiting for the test results.

I am only attaching the diff; the mballoc patch is really large.

-aneesh
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 4f329af..ec7d349 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -89,6 +89,8 @@ When mounting an ext4 filesystem, the following option are accepted:
extents ext4 will use extents to address file data. The
file system will no longer be mountable by ext3.

+noextents ext4 will not use extents for new files created.
+
journal_checksum Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel. It is a
@@ -206,6 +208,10 @@ nobh (a) cache disk block mapping information
"nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode).

+mballoc (*) Use the mutliblock allocator for block allocation
+nomballoc disabled multiblock allocator for block allocation.
+stripe=n filesystem blocks per stripe for a RAID configuration.
+

Data Mode
---------
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dec9945..4413a2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -857,6 +857,45 @@ CPUs.
The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete.

+1.9 Ext4 file system parameters
+------------------------------
+Ext4 file system have one directory per partition under /proc/fs/ext4/
+# ls /proc/fs/ext4/hdc/
+group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
+stats stream_req
+
+mb_groups:
+This file gives the details of mutiblock allocator buddy cache of free blocks
+
+mb_history:
+Multiblock allocation history.
+
+stats:
+This file indicate whether the multiblock allocator should start collecting
+statistics. The statistics are shown during unmount
+
+group_prealloc:
+The multiblock allocator normalize the block allocation request to
+group_prealloc filesystem blocks if we don't have strip value set.
+The stripe value can be specified at mount time or during mke2fs.
+
+max_to_scan:
+How long multiblock allocator can look for a best extent (in found extents)
+
+min_to_scan:
+How long multiblock allocator must look for a best extent
+
+order2_req:
+Multiblock allocator use 2^N search using buddies only for requests greater
+than or equal to order2_req. The request size is specfied in file system
+blocks. A value of 2 indicate only if the requests are greater than or equal
+to 4 blocks.
+
+stream_req:
+Files smaller than stream_req are served by the stream allocator, whose
+purpose is to pack requests as close each to other as possible to
+produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
+filesystem block size will use group based preallocation.

------------------------------------------------------------------------------
Summary
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0398aa0..310bad6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -489,7 +489,7 @@ struct ext4_free_extent {
*/
struct ext4_locality_group {
/* for allocator */
- struct semaphore lg_sem; /* to serialize allocates */
+ struct mutex lg_sem; /* to serialize allocates */
struct list_head lg_prealloc_list;/* list of preallocations */
spinlock_t lg_prealloc_lock;
};
@@ -563,7 +563,10 @@ struct ext4_buddy {
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)

#ifndef EXT4_MB_HISTORY
-#define ext4_mb_store_history(ac)
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+ return;
+}
#else
static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#endif
@@ -641,6 +644,10 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,

static inline int mb_test_bit(int bit, void *addr)
{
+ /*
+ * ext4_test_bit on architecture like powerpc
+ * needs unsigned long aligned address
+ */
mb_correct_addr_and_bit(bit, addr);
return ext4_test_bit(bit, addr);
}
@@ -669,7 +676,7 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
ext4_clear_bit_atomic(lock, bit, addr);
}

-static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;

@@ -752,9 +759,20 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
}

#else
-#define mb_free_blocks_double(a, b, c, d)
-#define mb_mark_used_double(a, b, c)
-#define mb_cmp_bitmaps(a, b)
+static inline void mb_free_blocks_double(struct inode *inode,
+ struct ext4_buddy *e4b, int first, int count)
+{
+ return;
+}
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+ int first, int count)
+{
+ return;
+}
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+ return;
+}
#endif

#ifdef AGGRESSIVE_CHECK
@@ -877,26 +895,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
#define mb_check_buddy(e4b)
#endif

-/* find most significant bit */
-static int fmsb(unsigned short word)
-{
- int order;
-
- if (word > 255) {
- order = 7;
- word >>= 8;
- } else {
- order = -1;
- }
-
- do {
- order++;
- word >>= 1;
- } while (word != 0);
-
- return order;
-}
-
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
void *buddy, unsigned first, int len,
@@ -917,7 +915,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
max = ffs(first | border) - 1;

/* find how many blocks of power 2 we need to mark */
- min = fmsb(len);
+ min = fls(len);

if (max < min)
min = max;
@@ -1029,10 +1027,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (groups_per_page > 1) {
err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
- bh = kmalloc(i, GFP_NOFS);
+ bh = kzalloc(i, GFP_NOFS);
if (bh == NULL)
goto out;
- memset(bh, 0, i);
} else
bh = &bhs;

@@ -1055,15 +1052,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh[i] == NULL)
goto out;

- if (buffer_uptodate(bh[i]))
+ if (bh_uptodate_or_lock(bh[i]))
continue;

- lock_buffer(bh[i]);
- if (buffer_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
-
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
@@ -1302,7 +1293,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
len = cur + len;
while (cur < len) {
if ((cur & 31) == 0 && (len - cur) >= 32) {
- /* fast path: clear whole word at once */
+ /* fast path: set whole word at once */
addr = bm + (cur >> 3);
*addr = 0xffffffff;
cur += 32;
@@ -2675,7 +2666,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
for (i = 0; i < NR_CPUS; i++) {
struct ext4_locality_group *lg;
lg = &sbi->s_locality_groups[i];
- sema_init(&lg->lg_sem, 1);
+ mutex_init(&lg->lg_sem);
INIT_LIST_HEAD(&lg->lg_prealloc_list);
spin_lock_init(&lg->lg_prealloc_lock);
}
@@ -2687,6 +2678,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
return 0;
}

+/* needs to be called with ext4 group lock (ext4_lock_group) */
static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
@@ -2695,7 +2687,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)

list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
count++;
kfree(pa);
}
@@ -3441,6 +3433,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
+ * Needs to be called with the ext4 group lock (ext4_lock_group)
*/
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group)
@@ -3462,7 +3455,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
* allocation in buddy when concurrent ext4_mb_put_pa()
* is dropping preallocation
*/
- list_for_each_rcu(cur, &grp->bb_prealloc_list) {
+ list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
spin_lock(&pa->pa_lock);
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
@@ -3486,7 +3479,6 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
kmem_cache_free(ext4_pspace_cachep, pa);
}
-#define mb_call_rcu(__pa) call_rcu(&(__pa)->u.pa_rcu, ext4_mb_pa_callback)

/*
* drops a reference to preallocated space descriptor
@@ -3528,14 +3520,14 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
* against that pair
*/
ext4_lock_group(sb, grp);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
ext4_unlock_group(sb, grp);

spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);

- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

/*
@@ -3615,7 +3607,7 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_inode = ac->ac_inode;

ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
@@ -3672,7 +3664,7 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_inode = NULL;

ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);

spin_lock(pa->pa_obj_lock);
@@ -3853,7 +3845,7 @@ repeat:

spin_unlock(&pa->pa_lock);

- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
list_add(&pa->u.pa_tmp_list, &list);
}

@@ -3889,7 +3881,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);

list_del(&pa->u.pa_tmp_list);
- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

out:
@@ -3942,9 +3934,8 @@ repeat:
spin_unlock(&pa->pa_lock);
spin_unlock(&ei->i_prealloc_lock);
printk(KERN_ERR "uh-oh! used pa while discarding\n");
- dump_stack();
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(HZ);
+ WARN_ON(1);
+ schedule_timeout_uninterruptible(HZ);
goto repeat;

}
@@ -3972,8 +3963,7 @@ repeat:
* add a flag to force wait only in case
* of ->clear_inode(), but not in case of
* regular truncate */
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(HZ);
+ schedule_timeout_uninterruptible(HZ);
goto repeat;
}
spin_unlock(&ei->i_prealloc_lock);
@@ -3993,7 +3983,7 @@ repeat:
}

ext4_lock_group(sb, group);
- list_del_rcu(&pa->pa_group_list);
+ list_del(&pa->pa_group_list);
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
ext4_unlock_group(sb, group);

@@ -4001,7 +3991,7 @@ repeat:
brelse(bitmap_bh);

list_del(&pa->u.pa_tmp_list);
- mb_call_rcu(pa);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
}

@@ -4051,7 +4041,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
struct list_head *cur;
- list_for_each_rcu(cur, &grp->bb_prealloc_list) {
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space,
pa_group_list);
spin_lock(&pa->pa_lock);
@@ -4061,6 +4052,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "PA:%lu:%d:%u \n", i,
start, pa->pa_len);
}
+ ext4_unlock_group(sb, i);

if (grp->bb_free == 0)
continue;
@@ -4070,7 +4062,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "\n");
}
#else
-#define ext4_mb_show_ac(x)
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+ return;
+}
#endif

/*
@@ -4091,8 +4086,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)

size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
+ size = max(size, isize);

/* don't use group allocation for large files */
if (size >= sbi->s_mb_stream_request)
@@ -4102,6 +4096,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;

BUG_ON(ac->ac_lg != NULL);
+ /*
+ * Locality group prealloc space is per-CPU. The reason for having
+ * per-CPU locality groups is to reduce contention between block
+ * requests coming from multiple CPUs.
+ */
ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
put_cpu();

@@ -4109,7 +4108,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

/* serialize all allocations in the group */
- down(&ac->ac_lg->lg_sem);
+ mutex_lock(&ac->ac_lg->lg_sem);
}

static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
@@ -4202,7 +4201,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_buddy_page)
page_cache_release(ac->ac_buddy_page);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- up(&ac->ac_lg->lg_sem);
+ mutex_unlock(&ac->ac_lg->lg_sem);
ext4_mb_collect_stats(ac);
return 0;
}
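
For illustration only: a rough, self-contained userspace sketch, with made-up
demo_* names, of the per-CPU locality group idea described in the comment added
above. One group per CPU means allocations from different CPUs rarely contend,
and a mutex serializes allocations within a group, which is what the lg_sem
semaphore-to-mutex conversion is about. Build with -pthread; the point is only
the shape of the data, not real block allocation.

#include <pthread.h>
#include <stdio.h>

#define DEMO_NR_GROUPS 4		/* stand-in for NR_CPUS */

struct demo_locality_group {
	pthread_mutex_t lg_mutex;	/* serializes allocations within this group */
	long lg_allocated;		/* toy per-group state */
};

static struct demo_locality_group demo_groups[DEMO_NR_GROUPS];

/*
 * Pick a group for the caller.  The kernel indexes by the current CPU
 * (get_cpu()); here we just hash the thread id.
 */
static struct demo_locality_group *demo_pick_group(void)
{
	unsigned long id = (unsigned long) pthread_self();

	return &demo_groups[id % DEMO_NR_GROUPS];
}

static void demo_allocate(long blocks)
{
	struct demo_locality_group *lg = demo_pick_group();

	pthread_mutex_lock(&lg->lg_mutex);	/* like mutex_lock(&lg->lg_sem) */
	lg->lg_allocated += blocks;
	pthread_mutex_unlock(&lg->lg_mutex);
}

int main(void)
{
	int i;

	for (i = 0; i < DEMO_NR_GROUPS; i++)
		pthread_mutex_init(&demo_groups[i].lg_mutex, NULL);

	demo_allocate(8);
	printf("allocated %ld blocks in this thread's group\n",
	       demo_pick_group()->lg_allocated);
	return 0;
}
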
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 136d095..3a51ffc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1779,13 +1779,14 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
unsigned long stripe_width =
le32_to_cpu(sbi->s_es->s_raid_stripe_width);

- if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
+ if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
return sbi->s_stripe;
- } else if (stripe_width <= sbi->s_blocks_per_group) {
+
+ if (stripe_width <= sbi->s_blocks_per_group)
return stripe_width;
- } else if (stride <= sbi->s_blocks_per_group) {
+
+ if (stride <= sbi->s_blocks_per_group)
return stride;
- }

return 0;
}
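
Side note on the stub conversions in the diff above: replacing an empty macro
such as "#define mb_mark_used_double(a, b, c)" with a static inline no-op keeps
argument type checking and argument evaluation even when the debug code is
compiled out, and the compiler still discards the empty call. A minimal,
self-contained sketch with made-up demo_* names (not taken from the patch):

#include <stdio.h>

struct demo_buddy {
	int first;
	int count;
};

#ifdef DEMO_DEBUG
static void demo_mark_used_double(struct demo_buddy *b, int first, int count)
{
	printf("marking %d+%d\n", first, count);
	b->first = first;
	b->count = count;
}
#else
/*
 * Compiled away entirely, but callers still get their arguments type-checked
 * and evaluated, unlike with "#define demo_mark_used_double(a, b, c)".
 */
static inline void demo_mark_used_double(struct demo_buddy *b, int first, int count)
{
	(void) b;
	(void) first;
	(void) count;
}
#endif

int main(void)
{
	struct demo_buddy b = { 0, 0 };

	/*
	 * Passing something other than a struct demo_buddy * here is still
	 * diagnosed by the compiler even when DEMO_DEBUG is not defined.
	 */
	demo_mark_used_double(&b, 10, 4);
	return 0;
}
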

2008-01-24 14:53:06

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

On Thu, Jan 24, 2008 at 01:26:14PM +0530, Aneesh Kumar K.V wrote:
>
> >
> > > +/* find most significant bit */
> > > +static int fmsb(unsigned short word)
> > > +{
> > > + int order;
> > > +
> > > + if (word > 255) {
> > > + order = 7;
> > > + word >>= 8;
> > > + } else {
> > > + order = -1;
> > > + }
> > > +
> > > + do {
> > > + order++;
> > > + word >>= 1;
> > > + } while (word != 0);
> > > +
> > > + return order;
> > > +}
> >
> > Did we just reinvent fls()?
>
> replaced by fls.
>
> >

That should be fls() - 1;
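
A quick standalone check of that off-by-one, using a hand-rolled fls() with the
kernel's 1-based convention (fls(1) == 1, fls(8) == 4), while the old fmsb()
returned the 0-based bit position:

#include <stdio.h>

/* Same convention as the kernel's fls(): 1-based, and 0 for a zero word. */
static int demo_fls(unsigned int word)
{
	int bit = 0;

	while (word) {
		bit++;
		word >>= 1;
	}
	return bit;
}

int main(void)
{
	unsigned int len;

	/* The order of the largest power-of-two chunk is fls(len) - 1. */
	for (len = 1; len <= 16; len <<= 1)
		printf("len=%2u  fls=%d  order=%d\n",
		       len, demo_fls(len), demo_fls(len) - 1);
	return 0;
}
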

The full patch is at

http://www.radian.org/~kvaneesh/ext4/jan-24-2008/mballoc-core.patch

The patch is too big to inline.

-aneesh