2008-01-07 18:28:36

by Aneesh Kumar K.V

Subject: [PATCH] mballoc changes from ldiskfs

Hi,

This patch is not even compile tested. I am sending it over to find out
whether some of the changes are even needed and to make sure I didn't
drop any bug fix in the merge.

Some things I noticed:

a) The prealloc table is completely gone.
b) The ext4_mb_put_pa change (I guess that is a bug in ldiskfs).


Now, by default, requests of less than 64K use locality group
preallocation.

The ldiskfs change I looked at is from
lustre/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch
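
For reference, here is a rough userspace sketch of how I read the
group-vs-stream decision after this patch (see the ext4_mb_group_or_file
hunk below). The 16-block threshold (64K with 4K blocks) is an assumption
on my part; MB_DEFAULT_STREAM_THRESHOLD itself is not visible in this patch.

#include <stdio.h>

#define ASSUMED_STREAM_THRESHOLD 16 /* blocks; 64K with 4K blocks (assumption) */

/* returns 1 when the request should use locality group preallocation */
static int use_group_prealloc(unsigned long request_end_block,
                              unsigned long isize_blocks,
                              unsigned long stream_request)
{
        unsigned long size = request_end_block;

        /* the patch compares against the on-disk i_size as well */
        if (size < isize_blocks)
                size = isize_blocks;

        /* large files keep per-inode (stream-style) preallocation */
        return size < stream_request;
}

int main(void)
{
        /* 8-block request in a small file -> locality group preallocation */
        printf("%d\n", use_group_prealloc(8, 4, ASSUMED_STREAM_THRESHOLD));
        /* 256-block file -> per-inode preallocation */
        printf("%d\n", use_group_prealloc(256, 256, ASSUMED_STREAM_THRESHOLD));
        return 0;
}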

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 58a70a1..cb84516 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -532,10 +532,10 @@ static inline void mb_set_bit(int bit, void *addr)
ext4_set_bit(bit, addr);
}

-static inline void mb_set_bit_atomic(int bit, void *addr)
+static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
{
mb_correct_addr_and_bit(bit, addr);
- ext4_set_bit_atomic(NULL, bit, addr);
+ ext4_set_bit_atomic(lock, bit, addr);
}

static inline void mb_clear_bit(int bit, void *addr)
@@ -544,10 +544,10 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}

-static inline void mb_clear_bit_atomic(int bit, void *addr)
+static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
{
mb_correct_addr_and_bit(bit, addr);
- ext4_clear_bit_atomic(NULL, bit, addr);
+ ext4_clear_bit_atomic(lock, bit, addr);
}

static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -1155,7 +1155,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
return 0;
}

-static void mb_clear_bits(void *bm, int cur, int len)
+static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
{
__u32 *addr;

@@ -1168,12 +1168,12 @@ static void mb_clear_bits(void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_clear_bit_atomic(cur, bm);
+ mb_clear_bit_atomic(lock, cur, bm);
cur++;
}
}

-static void mb_set_bits(void *bm, int cur, int len)
+static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
{
__u32 *addr;

@@ -1186,7 +1186,7 @@ static void mb_set_bits(void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_set_bit_atomic(cur, bm);
+ mb_set_bit_atomic(lock, cur, bm);
cur++;
}
}
@@ -1403,7 +1403,8 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
}

- mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
+ EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);

return ret;
@@ -1439,14 +1440,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
-
- /* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
- spin_lock(&sbi->s_md_lock);
- sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
- sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
- spin_unlock(&sbi->s_md_lock);
- }
}

/*
@@ -1509,8 +1502,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;

BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len >= (1 << ac->ac_sb->s_blocksize_bits) * 8);
- BUG_ON(ex->fe_start >= (1 << ac->ac_sb->s_blocksize_bits) * 8);
+ BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

ac->ac_found++;
@@ -1553,7 +1546,6 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
/* if the request is satisfied, then we try to find
* an extent that still satisfy the request, but is
* smaller than previous one */
- if (ex->fe_len < bex->fe_len)
*bex = *ex;
}

@@ -1702,8 +1694,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
i = e4b->bd_info->bb_first_free;

while (free && ac->ac_status == AC_STATUS_CONTINUE) {
- i = ext4_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
- if (i >= sb->s_blocksize * 8) {
+ i = ext4_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i);
+ if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
BUG_ON(free != 0);
break;
}
@@ -1744,7 +1736,7 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
% EXT4_BLOCKS_PER_GROUP(sb);

- while (i < sb->s_blocksize * 8) {
+ while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
@@ -1839,20 +1831,6 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_2order = i;
}

- /* if stream allocation is enabled, use global goal */
-
- /* FIXME!!
- * Need more explanation on what it is and how stream
- * allocation is represented by the below conditional
- */
- if ((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
- /* TBD: may be hot point */
- spin_lock(&sbi->s_md_lock);
- ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
- ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
- spin_unlock(&sbi->s_md_lock);
- }

group = ac->ac_g_ex.fe_group;

@@ -2291,7 +2269,8 @@ static void ext4_mb_history_init(struct super_block *sb)
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- memset(sbi->s_mb_history, 0, i);
+ if (likely(sbi->s_mb_history != NULL))
+ memset(sbi->s_mb_history, 0, i);
/* if we can't allocate history, then we simple won't use it */
}

@@ -2300,7 +2279,7 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_mb_history h;

- if (likely(sbi->s_mb_history == NULL))
+ if (unlikely(sbi->s_mb_history == NULL))
return;

if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2312,11 +2291,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac)
h.orig = ac->ac_o_ex;
h.result = ac->ac_b_ex;
h.flags = ac->ac_flags;
- h.found = ac->ac_found;
- h.groups = ac->ac_groups_scanned;
- h.cr = ac->ac_criteria;
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
h.merged = 0;
if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -2404,6 +2378,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
"EXT4-fs: can't read descriptor %lu\n", i);
goto err_freebuddy;
}
+ memset(meta_group_info[j], 0, len);
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&meta_group_info[j]->bb_state);

@@ -2512,32 +2487,10 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
sbi->s_mb_max_groups_to_scan = MB_DEFAULT_MAX_GROUPS_TO_SCAN;
sbi->s_mb_stats = MB_DEFAULT_STATS;
+ sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;

- sbi->s_mb_prealloc_table_size = 7;
- i = sbi->s_mb_prealloc_table_size;
- sbi->s_mb_prealloc_table = kmalloc(sizeof(unsigned long) * i,
- GFP_NOFS);
- if (sbi->s_mb_prealloc_table == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- return -ENOMEM;
- }
-
- sbi->s_mb_prealloc_table[0] = 4;
- sbi->s_mb_prealloc_table[1] = 8;
- sbi->s_mb_prealloc_table[2] = 16;
- sbi->s_mb_prealloc_table[3] = 32;
- sbi->s_mb_prealloc_table[4] = 64;
- sbi->s_mb_prealloc_table[5] = 128;
- sbi->s_mb_prealloc_table[6] = 256;
-
- sbi->s_mb_small_req = 256;
- sbi->s_mb_large_req = 1024;
- sbi->s_mb_group_prealloc = 512;
-
i = sizeof(struct ext4_locality_group) * NR_CPUS;
sbi->s_locality_groups = kmalloc(i, GFP_NOFS);
if (sbi->s_locality_groups == NULL) {
@@ -2713,75 +2666,9 @@ static void ext4_mb_free_committed_blocks(struct super_block *sb)
#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
#define EXT4_MB_ORDER2_REQ "order2_req"
-#define EXT4_MB_SMALL_REQ "small_req"
-#define EXT4_MB_LARGE_REQ "large_req"
-#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
-#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
-
-static int ext4_mb_read_prealloc_table(char *page, char **start,
- off_t off, int count, int *eof, void *data)
-{
- struct ext4_sb_info *sbi = data;
- int len = 0;
- int i;
-
- *eof = 1;
- if (off != 0)
- return 0;
- for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
- len += sprintf(page + len, "%ld ",
- sbi->s_mb_prealloc_table[i]);
- len += sprintf(page + len, "\n");
- *start = page;
- return len;
-}
-
-static int ext4_mb_write_prealloc_table(struct file *file,
- const char __user *buf, unsigned long cnt, void *data)
-{
- struct ext4_sb_info *sbi = data;
- unsigned long value;
- unsigned long prev = 0;
- char str[128];
- char *cur;
- char *end;
- unsigned long *new_table;
- int num = 0;
- int i = 0;
-
- if (cnt >= sizeof(str))
- return -EINVAL;
- if (copy_from_user(str, buf, cnt))
- return -EFAULT;
+#define EXT4_MB_STREAM_REQ "stream_req"

- num = 0;
- cur = str;
- end = str + cnt;
- while (cur < end) {
- while ((cur < end) && (*cur == ' ')) cur++;
- value = simple_strtol(cur, &cur, 0);
- if (value == 0)
- break;
- if (value <= prev)
- return -EINVAL;
- prev = value;
- num++;
- }

- new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
- if (new_table == NULL)
- return -ENOMEM;
- kfree(sbi->s_mb_prealloc_table);
- sbi->s_mb_prealloc_table = new_table;
- sbi->s_mb_prealloc_table_size = num;
- cur = str;
- end = str + cnt;
- while (cur < end && i < num) {
- while ((cur < end) && (*cur == ' ')) cur++;
- new_table[i++] = simple_strtol(cur, &cur, 0);
- }
- return cnt;
-}

#define MB_PROC_VALUE_READ(name) \
static int ext4_mb_read_##name(char *page, char **start, \
@@ -2823,12 +2710,8 @@ MB_PROC_VALUE_READ(min_to_scan);
MB_PROC_VALUE_WRITE(min_to_scan);
MB_PROC_VALUE_READ(order2_reqs);
MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(small_req);
-MB_PROC_VALUE_WRITE(small_req);
-MB_PROC_VALUE_READ(large_req);
-MB_PROC_VALUE_WRITE(large_req);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_VALUE_READ(stream_request);
+MB_PROC_VALUE_WRITE(stream_request);

#define MB_PROC_HANDLER(name, var) \
do { \
@@ -2857,18 +2740,13 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
- MB_PROC_HANDLER(EXT4_MB_SMALL_REQ, small_req);
- MB_PROC_HANDLER(EXT4_MB_LARGE_REQ, large_req);
- MB_PROC_HANDLER(EXT4_MB_PREALLOC_TABLE, prealloc_table);
- MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+ MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);

return 0;

err_out:
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc);
+ printk(KERN_ERR "EXT4-fs: Unable to create %s\n", name);
+ remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
@@ -2889,10 +2767,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)

snprintf(devname, sizeof(devname) - 1, "%s",
bdevname(sb->s_bdev, devname));
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_SMALL_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_LARGE_REQ, sbi->s_mb_proc);
+ remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
@@ -3035,7 +2910,11 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg = ac->ac_lg;

BUG_ON(lg == NULL);
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+ if (EXT4_SB(sb)->s_stripe)
+ ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
+ else
+ ac->ac_g_ex.fe_len = (1024 * 1024) >> sb->s_blocksize_bits;
+
mb_debug("#%u: goal %lu blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3085,34 +2964,50 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = i_size_read(ac->ac_inode);
size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;

- start = 0;
- wind = 0;
+ /* max available blocks in a free group */
+ max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1
+ - EXT4_SB(ac->ac_sb)->s_itb_per_group;

- /* let's choose preallocation window depending on file size */
- for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
- if (size <= sbi->s_mb_prealloc_table[i]) {
- wind = sbi->s_mb_prealloc_table[i];
- break;
- }
- }
- size = wind;
-
- if (wind == 0) {
- __u64 tstart, tend;
- /* file is quite large, we now preallocate with
- * the biggest configured window with regart to
- * logical offset */
- wind = sbi->s_mb_prealloc_table[i - 1];
- tstart = ac->ac_o_ex.fe_logical;
- do_div(tstart, wind);
- start = tstart * wind;
- tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
- do_div(tend, wind);
- tend = tend * wind + wind;
- size = tend - start;
+#define NRL_CHECK_SIZE(req,size,max,bits) \
+ (req <= (size) || max <= ((size) >> bits))
+
+ /* first, try to predict filesize */
+ /* XXX: should this table be tunable? */
+ start = 0;
+ if (size <= 16 * 1024) {
+ size = 16 * 1024;
+ } else if (size <= 32 * 1024) {
+ size = 32 * 1024;
+ } else if (size <= 64 * 1024) {
+ size = 64 * 1024;
+ } else if (size <= 128 * 1024) {
+ size = 128 * 1024;
+ } else if (size <= 256 * 1024) {
+ size = 256 * 1024;
+ } else if (size <= 512 * 1024) {
+ size = 512 * 1024;
+ } else if (size <= 1024 * 1024) {
+ size = 1024 * 1024;
+ } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+ start = ac->ac_o_ex.fe_logical << bsbits;
+ start = (start / (1024 * 1024)) * (1024 * 1024);
+ size = 1024 * 1024;
+ } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+ start = ac->ac_o_ex.fe_logical << bsbits;
+ start = (start / (4 * (1024 * 1024))) * 4 * (1024 * 1024);
+ size = 4 * 1024 * 1024;
+ } else if(NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,(8<<20)>>bsbits,max,bsbits)){
+ start = ac->ac_o_ex.fe_logical;
+ start = start << bsbits;
+ start = (start / (8 * (1024 * 1024))) * 8 * (1024 * 1024);
+ size = 8 * 1024 * 1024;
+ } else {
+ start = ac->ac_o_ex.fe_logical;
+ start = start << bsbits;
+ size = ac->ac_o_ex.fe_len << bsbits;
}
- orig_size = size;
- orig_start = start;
+ orig_size = size = size >> bsbits;
+ orig_start = start = start >> bsbits;

/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
@@ -3203,22 +3098,6 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_g_ex.fe_logical = start;
ac->ac_g_ex.fe_len = size;

- /* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
- /* merge to the right */
- ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
- ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
- }
- if (ar->pleft && (ar->lleft + 1 == start)) {
- /* merge to the left */
- ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
- ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
- }
-
mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3395,8 +3274,10 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
&groupnr, &start);
len = pa->pa_len;
spin_unlock(&pa->pa_lock);
+ if (unlikely(len == 0))
+ continue;
BUG_ON(groupnr != group);
- mb_set_bits(bitmap, start, len);
+ mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), bitmap, start, len);
preallocated += len;
count++;
}
@@ -3425,7 +3306,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,

/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 1) {
+ if (pa->pa_deleted == 0) {
spin_unlock(&pa->pa_lock);
return;
}
@@ -3641,7 +3522,7 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,

BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group);
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;

ac.ac_sb = sb;
@@ -3696,7 +3577,7 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,

BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group);
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);

@@ -3997,19 +3878,19 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int bsbits = ac->ac_sb->s_blocksize_bits;
+ loff_t size, isize;

if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;

- /* request is so large that we don't care about
- * streaming - it overweights any possible seek */
- if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
- return;
+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ isize = i_size_read(ac->ac_inode) >> bsbits;
+ if (size < isize)
+ size = isize;

- /* FIXME!!
- * is this >= considering the above ?
- */
- if (ac->ac_o_ex.fe_len >= sbi->s_mb_small_req)
+ /* don't use group allocation for large files */
+ if (size >= sbi->s_mb_stream_request)
return;

if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
@@ -4419,7 +4300,8 @@ do_more:
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+ bit, count);

/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 85100ea..4d4ad18 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -107,19 +107,12 @@ struct ext4_sb_info {
/* tunables */
unsigned long s_mb_factor;
unsigned long s_stripe;
- unsigned long s_mb_small_req;
- unsigned long s_mb_large_req;
+ unsigned long s_mb_stream_request;
unsigned long s_mb_max_to_scan;
unsigned long s_mb_min_to_scan;
unsigned long s_mb_max_groups_to_scan;
unsigned long s_mb_stats;
unsigned long s_mb_order2_reqs;
- unsigned long *s_mb_prealloc_table;
- unsigned long s_mb_prealloc_table_size;
- unsigned long s_mb_group_prealloc;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;

/* history to debug policy */
struct ext4_mb_history *s_mb_history;


2008-01-07 18:58:07

by Aneesh Kumar K.V

Subject: Re: [PATCH] mballoc changes from ldiskfs

On Mon, Jan 07, 2008 at 11:58:00PM +0530, Aneesh Kumar K.V wrote:
> Hi,
>
> This patch is not even compile tested. I am sending it over to find out
> whether some of the changes are even needed and to make sure I didn't
> drop any bug fix in the merge.
>
> Some things I noticed:
>
> a) The prealloc table is completely gone.
> b) The ext4_mb_put_pa change (I guess that is a bug in ldiskfs).
>
>
> Now, by default, requests of less than 64K use locality group
> preallocation.
>
> The ldiskfs change I looked at is from
> lustre/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch

[... snip ...]

>
> BUG_ON(ex->fe_len <= 0);
> - BUG_ON(ex->fe_len >= (1 << ac->ac_sb->s_blocksize_bits) * 8);
> - BUG_ON(ex->fe_start >= (1 << ac->ac_sb->s_blocksize_bits) * 8);
> + BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
> + BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
> BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
>
> ac->ac_found++;
> @@ -1553,7 +1546,6 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
> /* if the request is satisfied, then we try to find
> * an extent that still satisfy the request, but is
> * smaller than previous one */
> - if (ex->fe_len < bex->fe_len)
> *bex = *ex;
> }


This one is a bug fix from Alex for the ext4 patch queue, so this change is needed.


>
> @@ -1702,8 +1694,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
> i = e4b->bd_info->bb_first_free;
>
> while (free && ac->ac_status == AC_STATUS_CONTINUE) {
> - i = ext4_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
> - if (i >= sb->s_blocksize * 8) {
> + i = ext4_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i);
> + if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
> BUG_ON(free != 0);
> break;
> }
> @@ -1744,7 +1736,7 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
> i = (i - le32_to_cpu(sbi->s_es->s_first_data_block))
> % EXT4_BLOCKS_PER_GROUP(sb);
>
> - while (i < sb->s_blocksize * 8) {
> + while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
> if (!mb_test_bit(i, bitmap)) {
> max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
> if (max >= sbi->s_stripe) {
> @@ -1839,20 +1831,6 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
> ac->ac_2order = i;
> }
>
> - /* if stream allocation is enabled, use global goal */
> -
> - /* FIXME!!
> - * Need more explanation on what it is and how stream
> - * allocation is represented by the below conditional
> - */
> - if ((ac->ac_g_ex.fe_len < sbi->s_mb_large_req) &&
> - (ac->ac_flags & EXT4_MB_HINT_DATA)) {
> - /* TBD: may be hot point */
> - spin_lock(&sbi->s_md_lock);
> - ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
> - ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
> - spin_unlock(&sbi->s_md_lock);
> - }
>
> group = ac->ac_g_ex.fe_group;
>
> @@ -2291,7 +2269,8 @@ static void ext4_mb_history_init(struct super_block *sb)
> spin_lock_init(&sbi->s_mb_history_lock);
> i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
> sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
> - memset(sbi->s_mb_history, 0, i);
> + if (likely(sbi->s_mb_history != NULL))
> + memset(sbi->s_mb_history, 0, i);
> /* if we can't allocate history, then we simple won't use it */
> }
>
> @@ -2300,7 +2279,7 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac)
> struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> struct ext4_mb_history h;
>
> - if (likely(sbi->s_mb_history == NULL))
> + if (unlikely(sbi->s_mb_history == NULL))
> return;
>
> if (!(ac->ac_op & sbi->s_mb_history_filter))
> @@ -2312,11 +2291,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac)
> h.orig = ac->ac_o_ex;
> h.result = ac->ac_b_ex;
> h.flags = ac->ac_flags;
> - h.found = ac->ac_found;
> - h.groups = ac->ac_groups_scanned;
> - h.cr = ac->ac_criteria;
> - h.tail = ac->ac_tail;
> - h.buddy = ac->ac_buddy;
> h.merged = 0;


This one is a bug fix from Alex for the ext4 patch queue, so this change is
needed.


[....]
> +
> + /* first, try to predict filesize */
> + /* XXX: should this table be tunable? */

Here the comment says this table should perhaps be tunable. Does that mean
the prealloc table is needed after all?
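
If we do end up wanting a tunable here, the lookup itself is small. A rough
illustration only, patterned on the s_mb_prealloc_table code this patch
removes; the helper name and the KB values are made up for the example:

/* Illustration only: the hard-coded ladder quoted below, expressed as a
 * table in the spirit of the removed s_mb_prealloc_table.  Values are in
 * KB; a real tunable would be filled from /proc instead. */
static const unsigned long example_prealloc_kb[] = {
        16, 32, 64, 128, 256, 512, 1024,
};

static unsigned long example_pick_window_kb(unsigned long size_kb)
{
        unsigned int i;

        for (i = 0; i < sizeof(example_prealloc_kb) /
                        sizeof(example_prealloc_kb[0]); i++) {
                if (size_kb <= example_prealloc_kb[i])
                        return example_prealloc_kb[i];
        }
        return 0; /* larger than the table: fall back to the MB-sized cases */
}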


> + start = 0;
> + if (size <= 16 * 1024) {
> + size = 16 * 1024;
> + } else if (size <= 32 * 1024) {
> + size = 32 * 1024;
> + } else if (size <= 64 * 1024) {
> + size = 64 * 1024;
> + } else if (size <= 128 * 1024) {
> + size = 128 * 1024;
> + } else if (size <= 256 * 1024) {
> + size = 256 * 1024;
> + } else if (size <= 512 * 1024) {
> + size = 512 * 1024;
> + } else if (size <= 1024 * 1024) {
> + size = 1024 * 1024;
> + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
> + start = ac->ac_o_ex.fe_logical << bsbits;
> + start = (start / (1024 * 1024)) * (1024 * 1024);
> + size = 1024 * 1024;
> + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
> + start = ac->ac_o_ex.fe_logical << bsbits;
> + start = (start / (4 * (1024 * 1024))) * 4 * (1024 * 1024);
> + size = 4 * 1024 * 1024;
> + } else if(NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,(8<<20)>>bsbits,max,bsbits)){
> + start = ac->ac_o_ex.fe_logical;
> + start = start << bsbits;
> + start = (start / (8 * (1024 * 1024))) * 8 * (1024 * 1024);
> + size = 8 * 1024 * 1024;
> + } else {
> + start = ac->ac_o_ex.fe_logical;
> + start = start << bsbits;
> + size = ac->ac_o_ex.fe_len << bsbits;
> }
> - orig_size = size;
> - orig_start = start;
> + orig_size = size = size >> bsbits;
> + orig_start = start = start >> bsbits;
>
> /* don't cover already allocated blocks in selected range */
> if (ar->pleft && start <= ar->lleft) {
> @@ -3203,22 +3098,6 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
> ac->ac_g_ex.fe_logical = start;
> ac->ac_g_ex.fe_len = size;
>
> - /* define goal start in order to merge */
> - if (ar->pright && (ar->lright == (start + size))) {
> - /* merge to the right */
> - ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
> - &ac->ac_f_ex.fe_group,
> - &ac->ac_f_ex.fe_start);
> - ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
> - }
> - if (ar->pleft && (ar->lleft + 1 == start)) {
> - /* merge to the left */
> - ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
> - &ac->ac_f_ex.fe_group,
> - &ac->ac_f_ex.fe_start);
> - ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
> - }
> -
> mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
> (unsigned) orig_size, (unsigned) start);
> }
> @@ -3395,8 +3274,10 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> &groupnr, &start);
> len = pa->pa_len;
> spin_unlock(&pa->pa_lock);
> + if (unlikely(len == 0))
> + continue;
> BUG_ON(groupnr != group);
> - mb_set_bits(bitmap, start, len);
> + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), bitmap, start, len);
> preallocated += len;
> count++;
> }
> @@ -3425,7 +3306,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
>
> /* in this short window concurrent discard can set pa_deleted */
> spin_lock(&pa->pa_lock);
> - if (pa->pa_deleted == 1) {
> + if (pa->pa_deleted == 0) {
> spin_unlock(&pa->pa_lock);
> return;
> }


I think that should be == 1 (ldiskfs has it as == 0).
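
To spell out my reading (a sketch only, a userspace model of the hunk above
with a pthread mutex standing in for pa_lock): the early return is for the
case where a concurrent discard has already marked the pa deleted, which is
why the test wants to be == 1.

#include <pthread.h>

struct pa {
        pthread_mutex_t lock;   /* stands in for pa->pa_lock */
        int deleted;            /* stands in for pa->pa_deleted */
};

/* returns 1 if this caller should go on to free the preallocation */
static int put_pa_should_free(struct pa *pa)
{
        pthread_mutex_lock(&pa->lock);
        if (pa->deleted == 1) {
                /* a concurrent discard got here first: nothing to do */
                pthread_mutex_unlock(&pa->lock);
                return 0;
        }
        pa->deleted = 1;        /* claim the removal for ourselves */
        pthread_mutex_unlock(&pa->lock);
        return 1;
}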