2008-08-27 15:28:26

by Aneesh Kumar K.V

Subject: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

fbc->count is of type s64. The change was introduced by
0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
from long to s64. Moving to s64 also means on 32 bit architectures
we can get wrong values on fbc->count. Since fbc->count is read
more frequently and updated rarely use seqlocks. This should
reduce the impact of locking in the read path for 32bit arch.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
CC: [email protected]
---
include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
lib/percpu_counter.c | 20 ++++++++++----------
2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccd..1b711a1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -6,7 +6,7 @@
* WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
*/

-#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
@@ -16,7 +16,7 @@
#ifdef CONFIG_SMP

struct percpu_counter {
- spinlock_t lock;
+ seqlock_t lock;
s64 count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
@@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return __percpu_counter_sum(fbc);
}

-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+#if BITS_PER_LONG == 64
+static inline s64 fbc_count(struct percpu_counter *fbc)
{
return fbc->count;
}
+#else
+/* doesn't have atomic 64 bit operation */
+static inline s64 fbc_count(struct percpu_counter *fbc)
+{
+ s64 ret;
+ unsigned seq;
+ do {
+ seq = read_seqbegin(&fbc->lock);
+ ret = fbc->count;
+ } while (read_seqretry(&fbc->lock, seq));
+ return ret;
+
+}
+#endif
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+ return fbc_count(fbc);
+}

/*
* It is possible for the percpu_counter_read() to return a small negative
@@ -65,7 +85,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = fbc_count(fbc);

barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a866389..83bb809 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -18,13 +18,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
for_each_possible_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
*pcount = 0;
}
fbc->count = amount;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
}
EXPORT_SYMBOL(percpu_counter_set);

@@ -37,10 +37,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
if (count >= batch || count <= -batch) {
- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
fbc->count += count;
*pcount = 0;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
} else {
*pcount = count;
}
@@ -57,7 +57,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
s64 ret;
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
ret = fbc->count;
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
@@ -66,7 +66,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
}
fbc->count = ret;

- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
- spin_lock_init(&fbc->lock);
+ seqlock_init(&fbc->lock);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
@@ -95,7 +95,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)

err = percpu_counter_init(fbc, amount);
if (!err)
- lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+ lockdep_set_class(&fbc->lock.lock, &percpu_counter_irqsafe);
return err;
}

@@ -130,11 +130,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
s32 *pcount;
unsigned long flags;

- spin_lock_irqsave(&fbc->lock, flags);
+ write_seqlock_irqsave(&fbc->lock, flags);
pcount = per_cpu_ptr(fbc->counters, cpu);
fbc->count += *pcount;
*pcount = 0;
- spin_unlock_irqrestore(&fbc->lock, flags);
+ write_sequnlock_irqrestore(&fbc->lock, flags);
}
mutex_unlock(&percpu_counters_lock);
return NOTIFY_OK;
--
1.6.0.1.90.g27a6e


2008-08-27 15:28:58

by Aneesh Kumar K.V

Subject: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

This patch adds dirty block accounting using percpu_counters.
Delayed allocation block reservation is now done by updating the
dirty block counter. In a later patch we switch to non-delalloc
mode if the filesystem's free block count is less than 150% of
the total filesystem dirty blocks.
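
As a rough illustration of that check (the switch itself is added
later in this series; the helper name below is only illustrative),
the "free < 150% of dirty" comparison can be done without a
division:

static int ext4_low_on_free_blocks(struct ext4_sb_info *sbi)
{
	s64 free_blocks, dirty_blocks;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);

	/* free < 150% of dirty  <=>  2 * free < 3 * dirty */
	return 2 * free_blocks < 3 * dirty_blocks;
}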

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 59 +++++++++++++++++++++++++++++++++-------------------
fs/ext4/ext4_sb.h | 1 +
fs/ext4/inode.c | 22 +++++++++---------
fs/ext4/mballoc.c | 17 ++------------
fs/ext4/super.c | 8 ++++++-
5 files changed, 59 insertions(+), 48 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5767332..b19346a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1605,26 +1605,38 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks)
{
- s64 free_blocks;
+ s64 free_blocks, dirty_blocks;
ext4_fsblk_t root_blocks = 0;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

- free_blocks = percpu_counter_read(fbc);
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);

if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
root_blocks = ext4_r_blocks_count(sbi->s_es);

- if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
- free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
-
- if (free_blocks < (root_blocks + nblocks))
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+ EXT4_FREEBLOCKS_WATERMARK) {
+ free_blocks = percpu_counter_sum(fbc);
+ dirty_blocks = percpu_counter_sum(dbc);
+ if (dirty_blocks < 0) {
+ printk(KERN_CRIT "Dirty block accounting "
+ "went wrong %lld\n",
+ dirty_blocks);
+ }
+ }
+ /* Check whether we have space after
+ * accounting for current dirty blocks
+ */
+ if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
/* we don't have free space */
return -ENOSPC;

- /* reduce fs free blocks counter */
- percpu_counter_sub(fbc, nblocks);
+ /* Add the blocks to nblocks */
+ percpu_counter_add(dbc, nblocks);
return 0;
}

@@ -1640,23 +1652,28 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks;
+ ext4_fsblk_t free_blocks, dirty_blocks;
ext4_fsblk_t root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);

if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
root_blocks = ext4_r_blocks_count(sbi->s_es);

- if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
- free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-
- if (free_blocks <= root_blocks)
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+ EXT4_FREEBLOCKS_WATERMARK) {
+ free_blocks = percpu_counter_sum_positive(fbc);
+ dirty_blocks = percpu_counter_sum_positive(dbc);
+ }
+ if (free_blocks <= (root_blocks + dirty_blocks))
/* we don't have free space */
return 0;
- if (free_blocks - root_blocks < nblocks)
+ if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
return free_blocks - root_blocks;
return nblocks;
}
@@ -1943,13 +1960,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
- /*
- * we allocated less blocks than we
- * claimed. Add the difference back.
- */
- percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
- }
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ /*
+ * Now reduce the dirty block count also. Should not go negative
+ */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226..0fa3762 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -59,6 +59,7 @@ struct ext4_sb_info {
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyblocks_counter;
struct blockgroup_lock s_blockgroup_lock;

/* root of the per fs reservation window tree */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 98a998b..14ec7d1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1030,19 +1030,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
- /* update fs free blocks counter for truncate case */
- percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+ if (mdb_free) {
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ }

/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= used;

- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
- EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

@@ -1588,8 +1589,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)

release = to_free + mdb_free;

- /* update fs free blocks counter for truncate case */
- percpu_counter_add(&sbi->s_freeblocks_counter, release);
+ /* update fs dirty blocks counter for truncate case */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);

/* update per-inode reservations */
BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -2471,7 +2472,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
-
retry:
/*
* With delayed allocation, we don't log the i_disksize update
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 419009f..4da4b9a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+ percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
/*
- * free blocks account has already be reduced/reserved
- * at write_begin() time for delayed allocation
- * do not double accounting
+ * Now reduce the dirty block count also. Should not go negative
*/
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
- ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
- /*
- * we allocated less blocks than we calimed
- * Add the difference back
- */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
- }
-
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ed77786..7b9db51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ }
if (err) {
printk(KERN_ERR "EXT4-fs: insufficient memory\n");
goto failed_mount3;
@@ -2491,6 +2495,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3164,7 +3169,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
--
1.6.0.1.90.g27a6e


2008-08-27 15:28:51

by Aneesh Kumar K.V

Subject: [PATCH -V3 02/11] ext4: Make sure all the block allocation paths reserve blocks

With delayed allocation we need to make sure blocks are reserved
before we attempt to allocate them. Otherwise we get a block
allocation failure (ENOSPC) during writepages which cannot
be handled. That would mean silent data loss (we do a printk
stating that data will be lost). This patch updates the DIO
and fallocate code paths to do block reservation before block
allocation. This is needed to make sure parallel DIO and
fallocate requests don't take blocks out of the delayed reserve
space.

When the free block count goes below a threshold we switch to
a slow path which looks at the other CPUs' accumulated percpu
counter values.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 58 ++++++++++++++++++++++++++++++++++++++--------------
fs/ext4/ext4.h | 13 +++++++++++
fs/ext4/inode.c | 5 +---
fs/ext4/mballoc.c | 23 +++++++++++---------
4 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cfed283..dc10bfd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1602,6 +1602,32 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
return ret;
}

+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
+{
+ s64 free_blocks;
+ ext4_fsblk_t root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+ free_blocks = percpu_counter_read(fbc);
+
+ if (!capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+ if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+ free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+ if (free_blocks < (root_blocks + nblocks))
+ /* we don't have free space */
+ return -ENOSPC;
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(fbc, nblocks);
+ return 0;
+}
+
/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
@@ -1623,18 +1649,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
sbi->s_resuid != current->fsuid &&
(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
- if (free_blocks - root_blocks < FBC_BATCH)
- free_blocks =
- percpu_counter_sum(&sbi->s_freeblocks_counter);
-#endif
+
+ if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+ free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+
if (free_blocks <= root_blocks)
/* we don't have free space */
return 0;
if (free_blocks - root_blocks < nblocks)
return free_blocks - root_blocks;
return nblocks;
- }
+}


/**
@@ -1713,14 +1738,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
/*
* With delalloc we already reserved the blocks
*/
- *count = ext4_has_free_blocks(sbi, *count);
- }
- if (*count == 0) {
- *errp = -ENOSPC;
- return 0; /*return with ENOSPC error */
+ if (ext4_claim_free_blocks(sbi, *count)) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
}
- num = *count;
-
/*
* Check quota for allocation of this block.
*/
@@ -1915,9 +1937,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- if (!EXT4_I(inode)->i_delalloc_reserved_flag)
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+ /*
+ * we allocated less blocks than we
+ * claimed. Add the difference back.
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+ }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7f11b25..71a4fde 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
unsigned long *count, int *errp);
extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
@@ -1295,6 +1297,17 @@ do { \
__ext4_std_error((sb), __func__, (errno)); \
} while (0)

+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+
/*
* Inodes and files operations
*/
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1c289c1..d965a05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;

- if (ext4_has_free_blocks(sbi, total) < total) {
+ if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return -ENOSPC;
}
- /* reduce fs free blocks counter */
- percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 82dd0e4..4404b46 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* at write_begin() time for delayed allocation
* do not double accounting
*/
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
- percpu_counter_sub(&sbi->s_freeblocks_counter,
- ac->ac_b_ex.fe_len);
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+ ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+ /*
+ * we allocated less blocks than we calimed
+ * Add the difference back
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter,
+ ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+ }

if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
/*
* With delalloc we already reserved the blocks
*/
- ar->len = ext4_has_free_blocks(sbi, ar->len);
- }
-
- if (ar->len == 0) {
- *errp = -ENOSPC;
- return 0;
+ if (ext4_claim_free_blocks(sbi, ar->len)) {
+ *errp = -ENOSPC;
+ return 0;
+ }
}

2008-08-27 15:29:19

by Aneesh Kumar K.V

Subject: [PATCH -V3 11/11] ext4: Retry block allocation if we have free blocks left

When we truncate files, the meta-data blocks released
are not reused until we commit the truncate transaction.
That means delayed get_block requests will return ENOSPC
even if we have free blocks left. Force a journal commit
and retry block allocation if we get ENOSPC with free
blocks left.
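
Roughly, the retry amounts to the sketch below (the control flow
here is illustrative only; jbd2_journal_force_commit_nested() is
the existing JBD2 interface for forcing a commit from inside a
transaction context):

	if (mpd.retval == -ENOSPC && ext4_count_free_blocks(inode->i_sb)) {
		/*
		 * Free blocks exist but the allocator still saw ENOSPC:
		 * they are most likely pinned by an uncommitted truncate.
		 * Force a commit to release them, then retry writeback.
		 */
		jbd2_journal_force_commit_nested(EXT4_SB(inode->i_sb)->s_journal);
		goto restart_loop;
	}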

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 81 ++++++++++++++++++++++++++++++++++++++----------------
1 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24381bb..bbe7fac 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1634,6 +1634,7 @@ static void ext4_da_page_release_reservation(struct page *page,
struct writeback_control *wbc;
int io_done;
long pages_written;
+ int retval;
};

/*
@@ -1820,6 +1821,24 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
return;
}

+static void ext4_print_free_blocks(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ printk(KERN_EMERG "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_EMERG "Free/Dirty block details\n");
+ printk(KERN_EMERG "free_blocks=%lld\n",
+ percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_EMERG "dirty_blocks=%lld\n",
+ percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_EMERG "Block reservation details\n");
+ printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
+ return;
+}
+
/*
* mpage_da_map_blocks - go through given space
*
@@ -1834,7 +1853,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
int err = 0;
struct buffer_head new;
struct buffer_head *lbh = &mpd->lbh;
- sector_t next = lbh->b_blocknr;
+ sector_t next;

/*
* We consider only non-mapped and non-allocated blocks
@@ -1844,6 +1863,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
new.b_state = lbh->b_state;
new.b_blocknr = 0;
new.b_size = lbh->b_size;
+ next = lbh->b_blocknr;
/*
* If we didn't accumulate anything
* to write simply return
@@ -1860,6 +1880,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
*/
if (err == -EAGAIN)
return 0;
+
+ if (err == -ENOSPC &&
+ ext4_count_free_blocks(mpd->inode->i_sb)) {
+ mpd->retval = err;
+ return 0;
+ }
+
/*
* get block failure will cause us
* to loop in writepages. Because
@@ -1877,8 +1904,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
if (err == -ENOSPC) {
- printk(KERN_CRIT "Total free blocks count %lld\n",
- ext4_count_free_blocks(mpd->inode->i_sb));
+ ext4_print_free_blocks(mpd->inode);
}
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
@@ -2085,39 +2111,36 @@ static int __mpage_da_writepage(struct page *page,
*/
static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
- get_block_t get_block)
+ struct mpage_da_data *mpd)
{
- struct mpage_da_data mpd;
long to_write;
int ret;

- if (!get_block)
+ if (!mpd->get_block)
return generic_writepages(mapping, wbc);

- mpd.wbc = wbc;
- mpd.inode = mapping->host;
- mpd.lbh.b_size = 0;
- mpd.lbh.b_state = 0;
- mpd.lbh.b_blocknr = 0;
- mpd.first_page = 0;
- mpd.next_page = 0;
- mpd.get_block = get_block;
- mpd.io_done = 0;
- mpd.pages_written = 0;
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ mpd->first_page = 0;
+ mpd->next_page = 0;
+ mpd->io_done = 0;
+ mpd->pages_written = 0;
+ mpd->retval = 0;

to_write = wbc->nr_to_write;

- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);

/*
* Handle last extent of pages
*/
- if (!mpd.io_done && mpd.next_page != mpd.first_page) {
- if (mpage_da_map_blocks(&mpd) == 0)
- mpage_da_submit_io(&mpd);
+ if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+ if (mpage_da_map_blocks(mpd) == 0)
+ mpage_da_submit_io(mpd);
}

- wbc->nr_to_write = to_write - mpd.pages_written;
+ wbc->nr_to_write = to_write - mpd->pages_written;
return ret;
}

@@ -2356,6 +2379,7 @@ static int ext4_da_writepages(struct address_space *mapping,
{
handle_t *handle = NULL;
loff_t range_start = 0;
+ struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int needed_blocks, ret = 0, nr_to_writebump = 0;
long to_write, pages_skipped = 0;
@@ -2389,6 +2413,9 @@ static int ext4_da_writepages(struct address_space *mapping,
range_start = wbc->range_start;
pages_skipped = wbc->pages_skipped;

+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+
restart_loop:
to_write = wbc->nr_to_write;
while (!ret && to_write > 0) {
@@ -2412,11 +2439,17 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}

2008-08-27 15:29:05

by Aneesh Kumar K.V

Subject: [PATCH -V3 06/11] ext4: Update meta-data reservation with delalloc.

This makes the meta-data reservation simpler. After each
block allocation request, if we have allocated some meta-data
blocks, subtract that number from the reserved meta-data blocks.
If the total reserved data block count after the allocation is
zero, free the remaining reserved meta-data blocks. During
reservation, if the total reserved blocks need more meta-data
blocks, add the extra meta-data blocks needed to the reserved
meta-data block count.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 75 +++++++++++++++++++++++++++----------------------------
1 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a45121f..3ef0822 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1019,31 +1019,34 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free;

spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - used;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
- if (mdb_free) {
- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
- /* update fs dirty blocks counter */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+ if (EXT4_I(inode)->i_allocated_meta_blocks) {
+ /* update the reseved meta- blocks */
+ BUG_ON(EXT4_I(inode)->i_allocated_meta_blocks >
+ EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks -=
+ EXT4_I(inode)->i_allocated_meta_blocks;
EXT4_I(inode)->i_allocated_meta_blocks = 0;
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ /*
+ * We already updated the percpu dirty
+ * block counter in the block allocation path.
+ */
}
-
/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= used;
+ if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ /*
+ * If we don't have any reserved data blocks
+ * release all the resered meta data blocks
+ * update percpu dirty block counter also.
+ */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = 0;

+ }
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

@@ -1524,7 +1527,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned long md_needed, mdblocks, total = 0;
+ unsigned long md_needed = 0, mdblocks, total = 0;

/*
* recalculate the amount of metadata blocks to reserve
@@ -1535,9 +1538,9 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
mdblocks = ext4_calc_metadata_amount(inode, total);
- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+ if (mdblocks > EXT4_I(inode)->i_reserved_meta_blocks)
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;

- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;

if (ext4_claim_free_blocks(sbi, total)) {
@@ -1549,7 +1552,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
- EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks += md_needed;

spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return 0; /* success */
@@ -1558,13 +1561,12 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
static void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free, release;
+ int release, mdb_free = 0;

if (!to_free)
return; /* Nothing to release, exit */

spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-
if (!EXT4_I(inode)->i_reserved_data_blocks) {
/*
* if there is no reserved blocks, but we try to free some
@@ -1578,26 +1580,23 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return;
}
+ /* update per-inode reservations */
+ BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+ if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ /*
+ * If we don't have any reserved data blocks
+ * release all the resered meta data blocks
+ * update percpu dirty block counter also.
+ */
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = 0;

- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
+ }
release = to_free + mdb_free;

/* update fs dirty blocks counter for truncate case */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
-
- /* update per-inode reservations */
- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= to_free;

2008-08-27 15:28:53

by Aneesh Kumar K.V

Subject: [PATCH -V3 03/11] ext4: Retry block reservation

During block reservation, if we don't have enough
blocks left, retry the block reservation with a smaller
block count. This makes sure we retry fallocate
and DIO with smaller request sizes and don't fail early.
The delayed allocation reservation cannot retry with a
smaller block count, so retry its block reservation to handle
temporary disk-full conditions. Also print free block details
if we fail block allocation during writepages.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 8 +++++++-
fs/ext4/inode.c | 14 +++++++++++---
fs/ext4/mballoc.c | 7 ++++++-
3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index dc10bfd..5767332 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1738,10 +1738,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
/*
* With delalloc we already reserved the blocks
*/
- if (ext4_claim_free_blocks(sbi, *count)) {
+ while (*count && ext4_claim_free_blocks(sbi, *count)) {
+ /* let others to free the space */
+ yield();
+ *count = *count >> 1;
+ }
+ if (!*count) {
*errp = -ENOSPC;
return 0; /*return with ENOSPC error */
}
+ num = *count;
}
/*
* Check quota for allocation of this block.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d965a05..98a998b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1521,6 +1521,7 @@ static int ext4_journalled_write_end(struct file *file,

static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
+ int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned long md_needed, mdblocks, total = 0;

@@ -1529,6 +1530,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
* in order to allocate nrblocks
* worse case is one extent per block
*/
+repeat:
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1539,6 +1541,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)

if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ yield();
+ goto repeat;
+ }
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1825,20 +1831,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err = 0;
+ struct buffer_head new;
struct buffer_head *lbh = &mpd->lbh;
sector_t next = lbh->b_blocknr;
- struct buffer_head new;

/*
* We consider only non-mapped and non-allocated blocks
*/
if (buffer_mapped(lbh) && !buffer_delay(lbh))
return 0;
-
new.b_state = lbh->b_state;
new.b_blocknr = 0;
new.b_size = lbh->b_size;
-
/*
* If we didn't accumulate anything
* to write simply return
@@ -1871,6 +1875,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
lbh->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
+ if (err == -ENOSPC) {
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(mpd->inode->i_sb));
+ }
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
lbh->b_size >> mpd->inode->i_blkbits);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4404b46..419009f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4397,7 +4397,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
/*
* With delalloc we already reserved the blocks
*/
- if (ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ /* let others to free the space */
+ yield();
+ ar->len = ar->len >> 1;
+ }
+ if (!ar->len) {
*errp = -ENOSPC;
return 0;
}
--
1.6.0.1.90.g27a6e


2008-08-27 15:29:01

by Aneesh Kumar K.V

Subject: [PATCH -V3 05/11] ext4: Switch to non delalloc mode when we are low on free blocks count.

Delayed allocation allocates blocks during writepages, which
means we cannot handle block allocation failures there. Switch to
non-delalloc mode when we are running low on free blocks.
Delayed allocation needs to do aggressive meta-data block
reservation, considering that the requested blocks can all be
discontiguous. Switching to non-delalloc avoids that. Also, we
can satisfy partial writes in non-delalloc mode.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 14ec7d1..a45121f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2458,6 +2458,33 @@ static int ext4_da_writepages(struct address_space *mapping,
return ret;
}

+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+ s64 free_blocks, dirty_blocks;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * switch to non delalloc mode if we are running low
+ * on free block. The free block accounting via percpu
+ * counters can get slightly wrong with FBC_BATCH getting
+ * accumulated on each CPU without updating global counters
+ * Delalloc need an accurate free block accounting. So switch
+ * to non delalloc when we are near to error range.
+ */
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+ if (2 * free_blocks < 3 * dirty_blocks ||
+ free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+ /*
+ * free block count is less that 150% of dirty blocks
+ * or free blocks is less that watermark
+ */
+ return 1;
+ }
+ return 0;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2472,6 +2499,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+
+ if (ext4_nonda_switch(inode->i_sb)) {
+ *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+ return ext4_write_begin(file, mapping, pos,
+ len, flags, pagep, fsdata);
+ }
+ *fsdata = (void *)0;
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2540,6 +2574,19 @@ static int ext4_da_write_end(struct file *file,
handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
+ int write_mode = (int)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+ if (ext4_should_order_data(inode)) {
+ return ext4_ordered_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else if (ext4_should_writeback_data(inode)) {
+ return ext4_writeback_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else {
+ BUG();
+ }
+ }

start = pos & (PAGE_CACHE_SIZE - 1);
end = start + copied -1;
@@ -4877,6 +4924,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
loff_t size;
unsigned long len;
int ret = -EINVAL;
+ void *fsdata;
struct file *file = vma->vm_file;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
@@ -4915,11 +4963,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
* on the same page though
*/
ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
- len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
if (ret < 0)
goto out_unlock;
ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
- len, len, page, NULL);
+ len, len, page, fsdata);
if (ret < 0)
goto out_unlock;
ret = 0;
--
1.6.0.1.90.g27a6e


2008-08-27 15:29:07

by Aneesh Kumar K.V

Subject: [PATCH -V3 07/11] ext4: request for blocks with ar.excepted_group = -1

Otherwise we skip group 0 during block allocation.
This causes ENOSPC even if we have free blocks in
group 0. This should be merged with defrag. The
excepted_group changes are introduced by the defrag patches.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 1 +
fs/ext4/extents.c | 1 +
2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b19346a..53fdb05 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -2023,6 +2023,7 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ar.goal = goal;
ar.len = *count;
ar.logical = iblock;
+ ar.excepted_group = -1;

if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
/* enable in-core preallocation for data block allocation */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bf612a7..268e96d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2879,6 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ar.goal = ext4_ext_find_goal(inode, path, iblock);
ar.logical = iblock;
ar.len = allocated;
+ ar.excepted_group = -1;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
--
1.6.0.1.90.g27a6e


2008-08-27 15:29:11

by Aneesh Kumar K.V

Subject: [PATCH -V3 08/11] ext4: Signed arithmetic fix

This patch converts some usages of ext4_fsblk_t to s64.
This is needed so that the signed comparisons work
as expected in the if conditions.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 19 ++++++++++---------
fs/ext4/ext4.h | 4 ++--
2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53fdb05..7fdc236 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1603,10 +1603,10 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
}

int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- ext4_fsblk_t nblocks)
+ s64 nblocks)
{
s64 free_blocks, dirty_blocks;
- ext4_fsblk_t root_blocks = 0;
+ s64 root_blocks = 0;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

@@ -1631,7 +1631,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
/* Check whether we have space after
* accounting for current dirty blocks
*/
- if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
+ if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
/* we don't have free space */
return -ENOSPC;

@@ -1650,10 +1650,10 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
* On success, return nblocks
*/
ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
- ext4_fsblk_t nblocks)
+ s64 nblocks)
{
- ext4_fsblk_t free_blocks, dirty_blocks;
- ext4_fsblk_t root_blocks = 0;
+ s64 free_blocks, dirty_blocks;
+ s64 root_blocks = 0;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

@@ -1667,14 +1667,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,

if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
EXT4_FREEBLOCKS_WATERMARK) {
- free_blocks = percpu_counter_sum_positive(fbc);
- dirty_blocks = percpu_counter_sum_positive(dbc);
+ free_blocks = percpu_counter_sum(fbc);
+ dirty_blocks = percpu_counter_sum(dbc);
}
if (free_blocks <= (root_blocks + dirty_blocks))
/* we don't have free space */
return 0;
+
if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
- return free_blocks - root_blocks;
+ return free_blocks - (root_blocks + dirty_blocks);
return nblocks;
}

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 71a4fde..13c69ed 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1048,9 +1048,9 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- ext4_fsblk_t nblocks);
+ s64 nblocks);
extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
- ext4_fsblk_t nblocks);
+ s64 nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
--
1.6.0.1.90.g27a6e


2008-08-27 15:29:17

by Aneesh Kumar K.V

Subject: [PATCH -V3 10/11] ext4: Add inode to journal handle after block allocation for ordered mode

This makes sure that when we have a block allocation failure
we don't have the inode added to the journal handle, so the
journal commit will not include the inode for which block
allocation failed.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 2 +-
fs/ext4/inode.c | 36 +++++++++++++++---------------------
2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a52fde3..9a0239e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -2061,7 +2061,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
/*
* Account for the allocated meta blocks
*/
- if (!(*errp)) {
+ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += *count;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3ef0822..24381bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1591,6 +1591,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
*/
mdb_free = EXT4_I(inode)->i_reserved_meta_blocks;
EXT4_I(inode)->i_reserved_meta_blocks = 0;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;

}
release = to_free + mdb_free;
@@ -2169,18 +2170,23 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
handle_t *handle = NULL;

handle = ext4_journal_current_handle();
- if (!handle) {
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, 0, 0, 0);
- BUG_ON(!ret);
- } else {
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, create, 0, EXT4_DELALLOC_RSVED);
- }
-
+ BUG_ON(!handle);
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
if (ret > 0) {
+
bh_result->b_size = (ret << inode->i_blkbits);

+ if (ext4_should_order_data(inode)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret)
+ /*
+ * Failed to add inode for ordered
+ * mode. Don't update file size
+ */
+ return ret;
+ }
+
/*
* Update on-disk size along with block allocation
* we don't use 'extend_disksize' as size may change
@@ -2406,18 +2412,6 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}
- if (ext4_should_order_data(inode)) {
- /*
- * With ordered mode we need to add
- * the inode to the journal handl
- * when we do block allocation.
- */
- ret = ext4_jbd2_file_inode(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out_writepages;
- }
- }

to_write -= wbc->nr_to_write;
ret = mpage_da_writepages(mapping, wbc,
--
1.6.0.1.90.g27a6e


2008-08-27 15:29:14

by Aneesh Kumar K.V

Subject: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC

Make sure we set windowsize to zero if the free
blocks left are less than the window size. Otherwise
we skip some groups with a low free block count during
block allocation.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 4 +++-
1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 7fdc236..a52fde3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1809,8 +1809,10 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
- && (rsv_is_empty(&my_rsv->rsv_window)))
+ && (rsv_is_empty(&my_rsv->rsv_window))) {
my_rsv = NULL;
+ windowsz = 0;
+ }

if (free_blocks > 0) {
bitmap_bh = ext4_read_block_bitmap(sb, group_no);
--
1.6.0.1.90.g27a6e


2008-08-27 19:07:25

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 27 Aug 2008 20:58:26 +0530
"Aneesh Kumar K.V" <[email protected]> wrote:

> fbc->count is of type s64. The change was introduced by
> 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> from long to s64. Moving to s64 also means on 32 bit architectures
> we can get wrong values on fbc->count. Since fbc->count is read
> more frequently and updated rarely use seqlocks. This should
> reduce the impact of locking in the read path for 32bit arch.
>

So... yesterday's suggestion to investigate implementing this at a
lower level wasn't popular?

> include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> lib/percpu_counter.c | 20 ++++++++++----------
> 2 files changed, 34 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 9007ccd..1b711a1 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -6,7 +6,7 @@
> * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> */
>
> -#include <linux/spinlock.h>
> +#include <linux/seqlock.h>
> #include <linux/smp.h>
> #include <linux/list.h>
> #include <linux/threads.h>
> @@ -16,7 +16,7 @@
> #ifdef CONFIG_SMP
>
> struct percpu_counter {
> - spinlock_t lock;
> + seqlock_t lock;
> s64 count;
> #ifdef CONFIG_HOTPLUG_CPU
> struct list_head list; /* All percpu_counters are on a list */
> @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +#if BITS_PER_LONG == 64
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> return fbc->count;
> }
> +#else
> +/* doesn't have atomic 64 bit operation */
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> +{
> + s64 ret;
> + unsigned seq;
> + do {
> + seq = read_seqbegin(&fbc->lock);
> + ret = fbc->count;
> + } while (read_seqretry(&fbc->lock, seq));
> + return ret;
> +

Please don't put unneeded blank lines into random places.

> +}
> +#endif

This is now too large to be inlined.

> +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +{
> + return fbc_count(fbc);
> +}

This change means that a percpu_counter_read() from interrupt context
on a 32-bit machine is now deadlockable, whereas it previously was not
deadlockable on either 32-bit or 64-bit.

This flows on to the lib/proportions.c, which uses
percpu_counter_read() and also does spin_lock_irqsave() internally,
indicating that it is (or was) designed to be used in IRQ contexts.

It means that bdi_stat() can no longer be used from interrupt context.

So a whole lot of thought and review and checking is needed here. It
should all be spelled out in the changelog. This will be a horridly
rare deadlock, so suitable WARN_ON()s should be added to detect when
callers are vulnerable to it.

Or we make the whole thing irq-safe.


2008-08-27 21:02:07

by Peter Zijlstra

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 2008-08-27 at 12:05 -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 20:58:26 +0530
> "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> > from long to s64. Moving to s64 also means on 32 bit architectures
> > we can get wrong values on fbc->count. Since fbc->count is read
> > more frequently and updated rarely use seqlocks. This should
> > reduce the impact of locking in the read path for 32bit arch.
> >
>
> So... yesterday's suggestion to investigate implementing this at a
> lower level wasn't popular?

I think it's a good idea to investigate a generic atomic64_t type.

i386 could possibly use cmpxchg8 if and when available, although using
that to read might be rather too expensive.

Doing something like:

struct atomic64_t {
seqlock_t lock;
s64 val;
};

might be somewhat unexpected from the sizeof() angle of things. Then
there is of course the possibility of hashing the locks...
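
Accessors for such a type could then look roughly like this
(illustrative sketch only, following the seqlock idea above):

static inline s64 atomic64_read(struct atomic64_t *v)
{
	s64 ret;
	unsigned seq;

	do {
		seq = read_seqbegin(&v->lock);
		ret = v->val;
	} while (read_seqretry(&v->lock, seq));
	return ret;
}

static inline void atomic64_add(s64 i, struct atomic64_t *v)
{
	write_seqlock(&v->lock);
	v->val += i;
	write_sequnlock(&v->lock);
}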



> > include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> > lib/percpu_counter.c | 20 ++++++++++----------
> > 2 files changed, 34 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> > index 9007ccd..1b711a1 100644
> > --- a/include/linux/percpu_counter.h
> > +++ b/include/linux/percpu_counter.h
> > @@ -6,7 +6,7 @@
> > * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> > */
> >
> > -#include <linux/spinlock.h>
> > +#include <linux/seqlock.h>
> > #include <linux/smp.h>
> > #include <linux/list.h>
> > #include <linux/threads.h>
> > @@ -16,7 +16,7 @@
> > #ifdef CONFIG_SMP
> >
> > struct percpu_counter {
> > - spinlock_t lock;
> > + seqlock_t lock;
> > s64 count;
> > #ifdef CONFIG_HOTPLUG_CPU
> > struct list_head list; /* All percpu_counters are on a list */
> > @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > return __percpu_counter_sum(fbc);
> > }
> >
> > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +#if BITS_PER_LONG == 64
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > {
> > return fbc->count;
> > }
> > +#else
> > +/* doesn't have atomic 64 bit operation */
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > +{
> > + s64 ret;
> > + unsigned seq;
> > + do {
> > + seq = read_seqbegin(&fbc->lock);
> > + ret = fbc->count;
> > + } while (read_seqretry(&fbc->lock, seq));
> > + return ret;
> > +
>
> Please don't put unneeded blank lines into random places.
>
> > +}
> > +#endif
>
> This is now too large to be inlined.
>
> > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +{
> > + return fbc_count(fbc);
> > +}
>
> This change means that a percpu_counter_read() from interrupt context
> on a 32-bit machine is now deadlockable, whereas it previously was not
> deadlockable on either 32-bit or 64-bit.
>
> This flows on to the lib/proportions.c, which uses
> percpu_counter_read() and also does spin_lock_irqsave() internally,
> indicating that it is (or was) designed to be used in IRQ contexts.

percpu_counter() never was irq safe, which is why the proportion stuff
does all the irq disabling bits by hand.

> It means that bdi_stat() can no longer be used from interrupt context.

Actually, as long as the write side of the seqlock usage is done with
IRQs disabled, the read side should be good.

If the read loop gets preempted by a write action, the seq count will
not match up and we'll just try again.

The only lethal combination is trying to do the read loop while inside
the write side.

If you look at backing-dev.h, you'll see that all modifying operations
disable IRQs.
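
The pattern being referred to is roughly the following (a
paraphrased sketch, not an exact quote of backing-dev.h):

static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
		enum bdi_stat_item item)
{
	__percpu_counter_add(&bdi->bdi_stat[item], 1, BDI_STAT_BATCH);
}

static inline void inc_bdi_stat(struct backing_dev_info *bdi,
		enum bdi_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);		/* write side with IRQs off, so a  */
	__inc_bdi_stat(bdi, item);	/* reader in IRQ context can never */
	local_irq_restore(flags);	/* interrupt a writer on this CPU  */
}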

> So a whole lot of thought and review and checking is needed here. It
> should all be spelled out in the changelog. This will be a horridly
> rare deadlock, so suitable WARN_ON()s should be added to detect when
> callers are vulnerable to it.
>
> Or we make the whole thing irq-safe.

That'd rather substantially penalize those cases where we don't need it.
From what I understood this whole pushf/popf stuff is insanely expensive
on a few archs.


2008-08-27 21:24:27

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 27 Aug 2008 23:01:52 +0200
Peter Zijlstra <[email protected]> wrote:

> >
> > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > +{
> > > + return fbc_count(fbc);
> > > +}
> >
> > This change means that a percpu_counter_read() from interrupt context
> > on a 32-bit machine is now deadlockable, whereas it previously was not
> > deadlockable on either 32-bit or 64-bit.
> >
> > This flows on to the lib/proportions.c, which uses
> > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > indicating that it is (or was) designed to be used in IRQ contexts.
>
> percpu_counter() never was irq safe, which is why the proportion stuff
> does all the irq disabling bits by hand.

percpu_counter_read() was irq-safe. That changes here. Needs careful
review, changelogging and, preferably, runtime checks. But perhaps
they should be inside some CONFIG_thing which won't normally be done in
production.

otoh, percpu_counter_read() is in fact a rare operation, so a bit of
overhead probably won't matter.

(write-often, read-rarely is the whole point. This patch's changelog's
assertion that "Since fbc->count is read more frequently and updated
rarely" is probably wrong. Most percpu_counters will have their
fbc->count modified far more frequently than having it read from).

> > It means that bdi_stat() can no longer be used from interrupt context.
>
> Actually, as long as the write side of the seqlock usage is done with
> IRQs disabled, the read side should be good.

yup.

> If the read loop gets preempted by a write action, the seq count will
> not match up and we'll just try again.
>
> The only lethal combination is trying to do the read loop while inside
> the write side.

yup

> If you look at backing-dev.h, you'll see that all modifying operations
> disable IRQs.

OK.

> > So a whole lot of thought and review and checking is needed here. It
> > should all be spelled out in the changelog. This will be a horridly
> > rare deadlock, so suitable WARN_ON()s should be added to detect when
> > callers are vulnerable to it.
> >
> > Or we make the whole thing irq-safe.
>
> That'd rather substantially penalize those cases where we don't need it.
> From what I understood this whole pushf/popf stuff is insanely expensive
> on a few archs.

Sure. I _expect_ that this interface change won't actually break
anything. But it adds a restriction which we should think about, and
document.



btw, what the heck is percpu_counter_init_irq()? Some mysterious
lockdep-specific thing?

<does git-fiddle. Oh. crappy changelog.>

I let that one leak through uncommented. Must be getting old.
Probably it will need an EXPORT_SYMBOL() sometime.


I expect that if we're going to go ahead and make percpu_counter_read()
no longer usable from interrupt context then we'll eventually end up
needing the full suite of _irq() and _irqsave() interface functions.
percpu_counter_add_irqsave(), etc.
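
As a purely hypothetical sketch of what such a wrapper might look like (no
helper by this name exists in the tree at this point), layered on the
existing percpu_counter_add():

static inline void percpu_counter_add_irqsave(struct percpu_counter *fbc,
					      s64 amount)
{
	unsigned long flags;

	local_irq_save(flags);		/* make the slow-path lock update irq-safe */
	percpu_counter_add(fbc, amount);
	local_irq_restore(flags);
}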

2008-08-28 03:48:32

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, Aug 27, 2008 at 12:05:53PM -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 20:58:26 +0530
> "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> > from long to s64. Moving to s64 also means on 32 bit architectures
> > we can get wrong values on fbc->count. Since fbc->count is read
> > more frequently and updated rarely use seqlocks. This should
> > reduce the impact of locking in the read path for 32bit arch.
> >
>
> So... yesterday's suggestion to investigate implementing this at a
> lower level wasn't popular?

I wanted to send the entire patchset which fixes the ENOSPC issues with
delalloc. It just happened to go out the day after you looked at the previous
mail. Sending the patch again in no way means we should not have a
generic atomic64_t.


>
> > include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> > lib/percpu_counter.c | 20 ++++++++++----------
> > 2 files changed, 34 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> > index 9007ccd..1b711a1 100644
> > --- a/include/linux/percpu_counter.h
> > +++ b/include/linux/percpu_counter.h
> > @@ -6,7 +6,7 @@
> > * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> > */
> >
> > -#include <linux/spinlock.h>
> > +#include <linux/seqlock.h>
> > #include <linux/smp.h>
> > #include <linux/list.h>
> > #include <linux/threads.h>
> > @@ -16,7 +16,7 @@
> > #ifdef CONFIG_SMP
> >
> > struct percpu_counter {
> > - spinlock_t lock;
> > + seqlock_t lock;
> > s64 count;
> > #ifdef CONFIG_HOTPLUG_CPU
> > struct list_head list; /* All percpu_counters are on a list */
> > @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > return __percpu_counter_sum(fbc);
> > }
> >
> > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +#if BITS_PER_LONG == 64
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > {
> > return fbc->count;
> > }
> > +#else
> > +/* doesn't have atomic 64 bit operation */
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > +{
> > + s64 ret;
> > + unsigned seq;
> > + do {
> > + seq = read_seqbegin(&fbc->lock);
> > + ret = fbc->count;
> > + } while (read_seqretry(&fbc->lock, seq));
> > + return ret;
> > +
>
> Please don't put unneeded blank lines into random places.
>

Will fix

> > +}
> > +#endif
>
> This is now too large to be inlined.
>


How do we actually figure that out? I have been making that mistake
quite often.

> > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +{
> > + return fbc_count(fbc);
> > +}
>

-aneesh

2008-08-28 03:52:11

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 23:01:52 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > >
> > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > +{
> > > > + return fbc_count(fbc);
> > > > +}
> > >
> > > This change means that a percpu_counter_read() from interrupt context
> > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > deadlockable on either 32-bit or 64-bit.
> > >
> > > This flows on to the lib/proportions.c, which uses
> > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > indicating that it is (or was) designed to be used in IRQ contexts.
> >
> > percpu_counter() never was irq safe, which is why the proportion stuff
> > does all the irq disabling bits by hand.
>
> percpu_counter_read() was irq-safe. That changes here. Needs careful
> review, changelogging and, preferably, runtime checks. But perhaps
> they should be inside some CONFIG_thing which won't normally be done in
> production.
>
> otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> overhead probably won't matter.
>
> (write-often, read-rarely is the whole point. This patch's changelog's
> assertion that "Since fbc->count is read more frequently and updated
> rarely" is probably wrong. Most percpu_counters will have their
> fbc->count modified far more frequently than having it read from).

We may actually be doing percpu_counter_add. But that doesn't update
fbc->count. Only if the local percpu values cross FBC_BATCH do we update
fbc->count. If we are modifying fbc->count more frequently than
reading fbc->count then I guess we would be contending on fbc->lock more.


-aneesh

2008-08-28 04:07:53

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thu, 28 Aug 2008 09:18:16 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> > This is now too large to be inlined.
> >
>
>
> How do we actually figure that out ? I have been making that mistakes
> quiet often.

Well. Experience and guesswork, mainly.

But a useful metric is to look at the /bin/size output before and
after the inlining. In this case fs/ext3/ialloc.o's text shrunk 40-odd
bytes, which we think is a net benefit due to reduced CPU cache
pressure.



2008-08-28 04:10:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thu, 28 Aug 2008 09:22:00 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> > On Wed, 27 Aug 2008 23:01:52 +0200
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > >
> > > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > > +{
> > > > > + return fbc_count(fbc);
> > > > > +}
> > > >
> > > > This change means that a percpu_counter_read() from interrupt context
> > > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > > deadlockable on either 32-bit or 64-bit.
> > > >
> > > > This flows on to the lib/proportions.c, which uses
> > > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > > indicating that it is (or was) designed to be used in IRQ contexts.
> > >
> > > percpu_counter() never was irq safe, which is why the proportion stuff
> > > does all the irq disabling bits by hand.
> >
> > percpu_counter_read() was irq-safe. That changes here. Needs careful
> > review, changelogging and, preferably, runtime checks. But perhaps
> > they should be inside some CONFIG_thing which won't normally be done in
> > production.
> >
> > otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> > overhead probably won't matter.
> >
> > (write-often, read-rarely is the whole point. This patch's changelog's
> > assertion that "Since fbc->count is read more frequently and updated
> > rarely" is probably wrong. Most percpu_counters will have their
> > fbc->count modified far more frequently than having it read from).
>
> we may actually be doing percpu_counter_add. But that doesn't update
> fbc->count. Only if the local percpu values cross FBC_BATCH we update
> fbc->count. If we are modifying fbc->count more frequently than
> reading fbc->count then i guess we would be contenting of fbc->lock more.
>
>

Yep. The frequency of modification of fbc->count is of the order of a
tenth or a hundredth of the frequency of
percpu_counter_<modification>() calls.

But in many cases the frequency of percpu_counter_read() calls is far
far less than this. For example, the percpu_counter_read() may only
happen when userspace polls a /proc file.



2008-08-28 04:19:40

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thursday 28 August 2008 14:06, Andrew Morton wrote:
> On Thu, 28 Aug 2008 09:18:16 +0530 "Aneesh Kumar K.V"
<[email protected]> wrote:
> > > This is now too large to be inlined.
> >
> > How do we actually figure that out ? I have been making that mistakes
> > quiet often.
>
> Well. Experience and guesswork, mainly.
>
> But a useful metric is to look and the /bin/size output before and
> after the inlining. In this case fs/ext3/ialloc.o's text shrunk 40-odd
> bytes, which we think is a net benefit due to reduced CPU cache
> pressure.

Weighed against register save/restore, compiler barrier, and function
call cost of uninlined. These can add up to 10s of cycles per call I've
seen, so if it is called several times between each icache miss it can
easily be worth inlining. Basically, measurement is required, and if it
isn't important enough to measure, policy tends to default to uninline if
that saves space.

2008-08-28 20:41:41

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 02/11] ext4: Make sure all the block allocation paths reserve blocks


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> With delayed allocation we need to make sure block are reserved
> before we attempt to allocate them. Otherwise we get block
> allocation failure (ENOSPC) during writepages which cannot
> be handled. This would mean silent data loss (We do a printk
> stating data will be lost). This patch update the DIO
> and fallocate code path to do block reservation before block
> allocation. This is needed to make sure parallel DIO and
> fallocate request doesn't take block out of delayed reserve
> space.
>
> When free blocks count go below a threshold we switch to
> a slow path which looks at other CPU's accumulated percpu
> counter values.
>

Added this patch to patch queue.

> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 58 ++++++++++++++++++++++++++++++++++++++--------------
> fs/ext4/ext4.h | 13 +++++++++++
> fs/ext4/inode.c | 5 +---
> fs/ext4/mballoc.c | 23 +++++++++++---------
> 4 files changed, 69 insertions(+), 30 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index cfed283..dc10bfd 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1602,6 +1602,32 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> return ret;
> }
>
> +int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks)
> +{
> + s64 free_blocks;
> + ext4_fsblk_t root_blocks = 0;
> + struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> +
> + free_blocks = percpu_counter_read(fbc);
> +
> + if (!capable(CAP_SYS_RESOURCE) &&
> + sbi->s_resuid != current->fsuid &&
> + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> + root_blocks = ext4_r_blocks_count(sbi->s_es);
> +
> + if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
> + free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
> +
> + if (free_blocks < (root_blocks + nblocks))
> + /* we don't have free space */
> + return -ENOSPC;
> +
> + /* reduce fs free blocks counter */
> + percpu_counter_sub(fbc, nblocks);
> + return 0;
> +}
> +

Any reason that we don't do percpu_counter_sum_and_sub() together? I
thought this fixed the parallel reservation race before.
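
For reference only, a hypothetical sketch of the kind of combined helper
meant above, written against the original spinlock-protected counter (no such
function exists in lib/percpu_counter.c); the point is that the sum and the
claim happen under one lock, so two racing reservations cannot both see the
same free count:

s64 percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)
{
	s64 ret;
	int cpu;

	spin_lock(&fbc->lock);
	ret = fbc->count;
	for_each_online_cpu(cpu)
		ret += *per_cpu_ptr(fbc->counters, cpu);
	if (ret >= amount)
		fbc->count -= amount;	/* claim while still holding the lock */
	spin_unlock(&fbc->lock);
	return ret;			/* caller checks ret >= amount */
}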

> /**
> * ext4_has_free_blocks()
> * @sbi: in-core super block structure.
> @@ -1623,18 +1649,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> sbi->s_resuid != current->fsuid &&
> (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> root_blocks = ext4_r_blocks_count(sbi->s_es);
> -#ifdef CONFIG_SMP
> - if (free_blocks - root_blocks < FBC_BATCH)
> - free_blocks =
> - percpu_counter_sum(&sbi->s_freeblocks_counter);
> -#endif
> +
> + if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
> + free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
> +
> if (free_blocks <= root_blocks)
> /* we don't have free space */
> return 0;
> if (free_blocks - root_blocks < nblocks)
> return free_blocks - root_blocks;
> return nblocks;
> - }
> +}
>
>
> /**
> @@ -1713,14 +1738,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> /*
> * With delalloc we already reserved the blocks
> */
> - *count = ext4_has_free_blocks(sbi, *count);
> - }
> - if (*count == 0) {
> - *errp = -ENOSPC;
> - return 0; /*return with ENOSPC error */
> + if (ext4_claim_free_blocks(sbi, *count)) {
> + *errp = -ENOSPC;
> + return 0; /*return with ENOSPC error */
> + }
> }
> - num = *count;
> -
> /*
> * Check quota for allocation of this block.
> */
> @@ -1915,9 +1937,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> le16_add_cpu(&gdp->bg_free_blocks_count, -num);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
> spin_unlock(sb_bgl_lock(sbi, group_no));
> - if (!EXT4_I(inode)->i_delalloc_reserved_flag)
> - percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> -
> + if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> + /*
> + * we allocated less blocks than we
> + * claimed. Add the difference back.
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> + }
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
> spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7f11b25..71a4fde 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
> unsigned long *count, int *errp);
> extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> ext4_fsblk_t goal, unsigned long *count, int *errp);
> +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks);
> extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks);
> extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
> @@ -1295,6 +1297,17 @@ do { \
> __ext4_std_error((sb), __func__, (errno)); \
> } while (0)
>
> +#ifdef CONFIG_SMP
> +/* Each CPU can accumulate FBC_BATCH blocks in their local
> + * counters. So we need to make sure we have free blocks more
> + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> + */
> +#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
> +#else
> +#define EXT4_FREEBLOCKS_WATERMARK 0
> +#endif
> +
> +
> /*
> * Inodes and files operations
> */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1c289c1..d965a05 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
> total = md_needed + nrblocks;
>
> - if (ext4_has_free_blocks(sbi, total) < total) {
> + if (ext4_claim_free_blocks(sbi, total)) {
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> return -ENOSPC;
> }
> - /* reduce fs free blocks counter */
> - percpu_counter_sub(&sbi->s_freeblocks_counter, total);
> -
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 82dd0e4..4404b46 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> * at write_begin() time for delayed allocation
> * do not double accounting
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
> - percpu_counter_sub(&sbi->s_freeblocks_counter,
> - ac->ac_b_ex.fe_len);
> + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> + ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> + /*
> + * we allocated less blocks than we calimed
> + * Add the difference back
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter,
> + ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> + }
>
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> @@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> /*
> * With delalloc we already reserved the blocks
> */
> - ar->len = ext4_has_free_blocks(sbi, ar->len);
> - }
> -
> - if (ar->len == 0) {
> - *errp = -ENOSPC;
> - return 0;
> + if (ext4_claim_free_blocks(sbi, ar->len)) {
> + *errp = -ENOSPC;
> + return 0;
> + }
> }
> -
> while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
> ar->flags |= EXT4_MB_HINT_NOPREALLOC;
> ar->len--;

2008-08-28 20:42:39

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 03/11] ext4: Retry block reservation


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> During block reservation if we don't have enough
> blocks left, retry block reservation with smaller
> block count. This make sure we try fallocate
> and DIO with smaller request size and don't fail early.
> The delayed allocation reservation cannot try with smaller
> block count. So retry block reservation to handle temporary
> disk full conditions. Also print free blocks details if we
> fail block allocation during writepages.
>
Added to patch queue.
Reviewed-by: Mingming Cao <[email protected]>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 8 +++++++-
> fs/ext4/inode.c | 14 +++++++++++---
> fs/ext4/mballoc.c | 7 ++++++-
> 3 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index dc10bfd..5767332 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1738,10 +1738,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> /*
> * With delalloc we already reserved the blocks
> */
> - if (ext4_claim_free_blocks(sbi, *count)) {
> + while (*count && ext4_claim_free_blocks(sbi, *count)) {
> + /* let others to free the space */
> + yield();
> + *count = *count >> 1;
> + }
> + if (!*count) {
> *errp = -ENOSPC;
> return 0; /*return with ENOSPC error */
> }
> + num = *count;
> }
> /*
> * Check quota for allocation of this block.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index d965a05..98a998b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1521,6 +1521,7 @@ static int ext4_journalled_write_end(struct file *file,
>
> static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> {
> + int retries = 0;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> unsigned long md_needed, mdblocks, total = 0;
>
> @@ -1529,6 +1530,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> * in order to allocate nrblocks
> * worse case is one extent per block
> */
> +repeat:
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
> mdblocks = ext4_calc_metadata_amount(inode, total);
> @@ -1539,6 +1541,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
>
> if (ext4_claim_free_blocks(sbi, total)) {
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> + if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
> + yield();
> + goto repeat;
> + }
> return -ENOSPC;
> }
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> @@ -1825,20 +1831,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
> static int mpage_da_map_blocks(struct mpage_da_data *mpd)
> {
> int err = 0;
> + struct buffer_head new;
> struct buffer_head *lbh = &mpd->lbh;
> sector_t next = lbh->b_blocknr;
> - struct buffer_head new;
>
> /*
> * We consider only non-mapped and non-allocated blocks
> */
> if (buffer_mapped(lbh) && !buffer_delay(lbh))
> return 0;
> -
> new.b_state = lbh->b_state;
> new.b_blocknr = 0;
> new.b_size = lbh->b_size;
> -
> /*
> * If we didn't accumulate anything
> * to write simply return
> @@ -1871,6 +1875,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
> lbh->b_size >> mpd->inode->i_blkbits, err);
> printk(KERN_EMERG "This should not happen.!! "
> "Data will be lost\n");
> + if (err == -ENOSPC) {
> + printk(KERN_CRIT "Total free blocks count %lld\n",
> + ext4_count_free_blocks(mpd->inode->i_sb));
> + }
> /* invlaidate all the pages */
> ext4_da_block_invalidatepages(mpd, next,
> lbh->b_size >> mpd->inode->i_blkbits);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 4404b46..419009f 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4397,7 +4397,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> /*
> * With delalloc we already reserved the blocks
> */
> - if (ext4_claim_free_blocks(sbi, ar->len)) {
> + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
> + /* let others to free the space */
> + yield();
> + ar->len = ar->len >> 1;
> + }
> + if (!ar->len) {
> *errp = -ENOSPC;
> return 0;
> }

2008-08-28 20:56:30

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

Added to patch queue

On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> This patch add dirty block accounting using percpu_counters.
> Delayed allocation block reservation is now done by updating
> dirty block counter. In the later patch we switch to non
> delalloc mode if the filesystem free blocks is < that
> 150 % of total filesystem dirty blocks
>


Reviewed-by: Mingming Cao <[email protected]>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 59 +++++++++++++++++++++++++++++++++-------------------
> fs/ext4/ext4_sb.h | 1 +
> fs/ext4/inode.c | 22 +++++++++---------
> fs/ext4/mballoc.c | 17 ++------------
> fs/ext4/super.c | 8 ++++++-
> 5 files changed, 59 insertions(+), 48 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 5767332..b19346a 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1605,26 +1605,38 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks)
> {
> - s64 free_blocks;
> + s64 free_blocks, dirty_blocks;
> ext4_fsblk_t root_blocks = 0;
> struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> - free_blocks = percpu_counter_read(fbc);
> + free_blocks = percpu_counter_read_positive(fbc);
> + dirty_blocks = percpu_counter_read_positive(dbc);
>
> if (!capable(CAP_SYS_RESOURCE) &&
> sbi->s_resuid != current->fsuid &&
> (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> root_blocks = ext4_r_blocks_count(sbi->s_es);
>
> - if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
> - free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
> -
> - if (free_blocks < (root_blocks + nblocks))
> + if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
> + EXT4_FREEBLOCKS_WATERMARK) {
> + free_blocks = percpu_counter_sum(fbc);
> + dirty_blocks = percpu_counter_sum(dbc);
> + if (dirty_blocks < 0) {
> + printk(KERN_CRIT "Dirty block accounting "
> + "went wrong %lld\n",
> + dirty_blocks);
> + }
> + }
> + /* Check whether we have space after
> + * accounting for current dirty blocks
> + */
> + if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
> /* we don't have free space */
> return -ENOSPC;
>
> - /* reduce fs free blocks counter */
> - percpu_counter_sub(fbc, nblocks);
> + /* Add the blocks to nblocks */
> + percpu_counter_add(dbc, nblocks);
> return 0;
> }
>
> @@ -1640,23 +1652,28 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks)
> {
> - ext4_fsblk_t free_blocks;
> + ext4_fsblk_t free_blocks, dirty_blocks;
> ext4_fsblk_t root_blocks = 0;
> + struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
> + free_blocks = percpu_counter_read_positive(fbc);
> + dirty_blocks = percpu_counter_read_positive(dbc);
>
> if (!capable(CAP_SYS_RESOURCE) &&
> sbi->s_resuid != current->fsuid &&
> (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> root_blocks = ext4_r_blocks_count(sbi->s_es);
>
> - if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
> - free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
> -
> - if (free_blocks <= root_blocks)
> + if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
> + EXT4_FREEBLOCKS_WATERMARK) {
> + free_blocks = percpu_counter_sum_positive(fbc);
> + dirty_blocks = percpu_counter_sum_positive(dbc);
> + }
> + if (free_blocks <= (root_blocks + dirty_blocks))
> /* we don't have free space */
> return 0;
> - if (free_blocks - root_blocks < nblocks)
> + if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
> return free_blocks - root_blocks;
> return nblocks;
> }
> @@ -1943,13 +1960,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> le16_add_cpu(&gdp->bg_free_blocks_count, -num);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
> spin_unlock(sb_bgl_lock(sbi, group_no));
> - if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> - /*
> - * we allocated less blocks than we
> - * claimed. Add the difference back.
> - */
> - percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> - }
> + percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> + /*
> + * Now reduce the dirty block count also. Should not go negative
> + */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
> spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
> index 6300226..0fa3762 100644
> --- a/fs/ext4/ext4_sb.h
> +++ b/fs/ext4/ext4_sb.h
> @@ -59,6 +59,7 @@ struct ext4_sb_info {
> struct percpu_counter s_freeblocks_counter;
> struct percpu_counter s_freeinodes_counter;
> struct percpu_counter s_dirs_counter;
> + struct percpu_counter s_dirtyblocks_counter;
> struct blockgroup_lock s_blockgroup_lock;
>
> /* root of the per fs reservation window tree */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 98a998b..14ec7d1 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1030,19 +1030,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
> BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
>
> - /* Account for allocated meta_blocks */
> - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> -
> - /* update fs free blocks counter for truncate case */
> - percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
> + if (mdb_free) {
> + /* Account for allocated meta_blocks */
> + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> +
> + /* update fs dirty blocks counter */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
> + EXT4_I(inode)->i_allocated_meta_blocks = 0;
> + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> + }
>
> /* update per-inode reservations */
> BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
> EXT4_I(inode)->i_reserved_data_blocks -= used;
>
> - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> - EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> - EXT4_I(inode)->i_allocated_meta_blocks = 0;
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> }
>
> @@ -1588,8 +1589,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
>
> release = to_free + mdb_free;
>
> - /* update fs free blocks counter for truncate case */
> - percpu_counter_add(&sbi->s_freeblocks_counter, release);
> + /* update fs dirty blocks counter for truncate case */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
>
> /* update per-inode reservations */
> BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
> @@ -2471,7 +2472,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
> -
> retry:
> /*
> * With delayed allocation, we don't log the i_disksize update
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 419009f..4da4b9a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
> spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> -
> + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
> /*
> - * free blocks account has already be reduced/reserved
> - * at write_begin() time for delayed allocation
> - * do not double accounting
> + * Now reduce the dirty block count also. Should not go negative
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> - ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> - /*
> - * we allocated less blocks than we calimed
> - * Add the difference back
> - */
> - percpu_counter_add(&sbi->s_freeblocks_counter,
> - ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> - }
> -
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> ac->ac_b_ex.fe_group);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index ed77786..7b9db51 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
> percpu_counter_destroy(&sbi->s_freeblocks_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> + percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
> brelse(sbi->s_sbh);
> #ifdef CONFIG_QUOTA
> for (i = 0; i < MAXQUOTAS; i++)
> @@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> err = percpu_counter_init(&sbi->s_dirs_counter,
> ext4_count_dirs(sb));
> }
> + if (!err) {
> + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
> + }
> if (err) {
> printk(KERN_ERR "EXT4-fs: insufficient memory\n");
> goto failed_mount3;
> @@ -2491,6 +2495,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> percpu_counter_destroy(&sbi->s_freeblocks_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> + percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
> failed_mount2:
> for (i = 0; i < db_count; i++)
> brelse(sbi->s_group_desc[i]);
> @@ -3164,7 +3169,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
> buf->f_type = EXT4_SUPER_MAGIC;
> buf->f_bsize = sb->s_blocksize;
> buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
> - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
> + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
> + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
> ext4_free_blocks_count_set(es, buf->f_bfree);
> buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
> if (buf->f_bfree < ext4_r_blocks_count(es))

2008-08-28 20:58:03

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 05/11] ext4: Switch to non delalloc mode when we are low on free blocks count.


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> delayed allocation allocate blocks during writepages. That also
> means we cannot handle block allocation failures. Switch to
> non - delalloc when we are running low on free blocks.
> Delayed allocation need to do aggressive meta-data block reservation
> considering that the requested blocks can all be discontiguous.
> Switching to non-delalloc avoids that. Also we can satisfy
> partial write in non-delalloc mode.
>
Added to patch queue

Reviewed-by: Mingming Cao <[email protected]>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 50 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 14ec7d1..a45121f 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2458,6 +2458,33 @@ static int ext4_da_writepages(struct address_space *mapping,
> return ret;
> }
>
> +#define FALL_BACK_TO_NONDELALLOC 1
> +static int ext4_nonda_switch(struct super_block *sb)
> +{
> + s64 free_blocks, dirty_blocks;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> + /*
> + * switch to non delalloc mode if we are running low
> + * on free block. The free block accounting via percpu
> + * counters can get slightly wrong with FBC_BATCH getting
> + * accumulated on each CPU without updating global counters
> + * Delalloc need an accurate free block accounting. So switch
> + * to non delalloc when we are near to error range.
> + */
> + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
> + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
> + if (2 * free_blocks < 3 * dirty_blocks ||
> + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
> + /*
> + * free block count is less that 150% of dirty blocks
> + * or free blocks is less that watermark
> + */
> + return 1;
> + }
> + return 0;
> +}
> +
> static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, void **fsdata)
> @@ -2472,6 +2499,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
> +
> + if (ext4_nonda_switch(inode->i_sb)) {
> + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
> + return ext4_write_begin(file, mapping, pos,
> + len, flags, pagep, fsdata);
> + }
> + *fsdata = (void *)0;
> retry:
> /*
> * With delayed allocation, we don't log the i_disksize update
> @@ -2540,6 +2574,19 @@ static int ext4_da_write_end(struct file *file,
> handle_t *handle = ext4_journal_current_handle();
> loff_t new_i_size;
> unsigned long start, end;
> + int write_mode = (int)fsdata;
> +
> + if (write_mode == FALL_BACK_TO_NONDELALLOC) {
> + if (ext4_should_order_data(inode)) {
> + return ext4_ordered_write_end(file, mapping, pos,
> + len, copied, page, fsdata);
> + } else if (ext4_should_writeback_data(inode)) {
> + return ext4_writeback_write_end(file, mapping, pos,
> + len, copied, page, fsdata);
> + } else {
> + BUG();
> + }
> + }
>
> start = pos & (PAGE_CACHE_SIZE - 1);
> end = start + copied -1;
> @@ -4877,6 +4924,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> loff_t size;
> unsigned long len;
> int ret = -EINVAL;
> + void *fsdata;
> struct file *file = vma->vm_file;
> struct inode *inode = file->f_path.dentry->d_inode;
> struct address_space *mapping = inode->i_mapping;
> @@ -4915,11 +4963,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> * on the same page though
> */
> ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
> - len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
> + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
> if (ret < 0)
> goto out_unlock;
> ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
> - len, len, page, NULL);
> + len, len, page, fsdata);
> if (ret < 0)
> goto out_unlock;
> ret = 0;

2008-08-28 21:03:18

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 06/11] ext4: Update meta-data reservation with delalloc.


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> This makes the meta-data reservation simpler. The logic
> followed is simpler. After each block allocation request
> if we have allocated some meta-data blocks subtract the
> same from the reserved meta-data blocks. If the total
> reserved data blocks after allocation is zero, free the
> remaining meta-data blocks reserved.

With this change ext4 keeps unnecessary blocks reserved for metadata
for a longer time (until all dirty data have been flushed); I am
concerned this will lead to early ENOSPC.

The current metadata reservation logic is a little complex, but it's not
that bad. It's there to make sure we don't over-reserve the metadata.
I'd say it's worth the effort.

> During reservation
> if the total reserved blocks need more meta-data blocks
> add the extra meta-data blocks needed to the reserve_meta_blocks
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 75 +++++++++++++++++++++++++++----------------------------
> 1 files changed, 37 insertions(+), 38 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a45121f..3ef0822 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1019,31 +1019,34 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
> static void ext4_da_update_reserve_space(struct inode *inode, int used)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> - int total, mdb, mdb_free;
>
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> - /* recalculate the number of metablocks still need to be reserved */
> - total = EXT4_I(inode)->i_reserved_data_blocks - used;
> - mdb = ext4_calc_metadata_amount(inode, total);
> -
> - /* figure out how many metablocks to release */
> - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
> -
> - if (mdb_free) {
> - /* Account for allocated meta_blocks */
> - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> -
> - /* update fs dirty blocks counter */
> - percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
> + if (EXT4_I(inode)->i_allocated_meta_blocks) {
> + /* update the reseved meta- blocks */
> + BUG_ON(EXT4_I(inode)->i_allocated_meta_blocks >
> + EXT4_I(inode)->i_reserved_meta_blocks);
> + EXT4_I(inode)->i_reserved_meta_blocks -=
> + EXT4_I(inode)->i_allocated_meta_blocks;
> EXT4_I(inode)->i_allocated_meta_blocks = 0;
> - EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> + /*
> + * We already updated the percpu dirty
> + * block counter in the block allocation path.
> + */
> }
> -
> /* update per-inode reservations */
> BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
> EXT4_I(inode)->i_reserved_data_blocks -= used;
> + if (!EXT4_I(inode)->i_reserved_data_blocks) {
> + /*
> + * If we don't have any reserved data blocks
> + * release all the resered meta data blocks
> + * update percpu dirty block counter also.
> + */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter,
> + EXT4_I(inode)->i_reserved_meta_blocks);
> + EXT4_I(inode)->i_reserved_meta_blocks = 0;
>
> + }
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> }
>
> @@ -1524,7 +1527,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> {
> int retries = 0;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> - unsigned long md_needed, mdblocks, total = 0;
> + unsigned long md_needed = 0, mdblocks, total = 0;
>
> /*
> * recalculate the amount of metadata blocks to reserve
> @@ -1535,9 +1538,9 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
> mdblocks = ext4_calc_metadata_amount(inode, total);
> - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
> + if (mdblocks > EXT4_I(inode)->i_reserved_meta_blocks)
> + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
>
> - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
> total = md_needed + nrblocks;
>
> if (ext4_claim_free_blocks(sbi, total)) {
> @@ -1549,7 +1552,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> return -ENOSPC;
> }
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
> + EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
>
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> return 0; /* success */
> @@ -1558,13 +1561,12 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> static void ext4_da_release_space(struct inode *inode, int to_free)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> - int total, mdb, mdb_free, release;
> + int release, mdb_free = 0;
>
> if (!to_free)
> return; /* Nothing to release, exit */
>
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> -
> if (!EXT4_I(inode)->i_reserved_data_blocks) {
> /*
> * if there is no reserved blocks, but we try to free some
> @@ -1578,26 +1580,23 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> return;
> }
> + /* update per-inode reservations */
> + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
> + EXT4_I(inode)->i_reserved_data_blocks -= to_free;
> + if (!EXT4_I(inode)->i_reserved_data_blocks) {
> + /*
> + * If we don't have any reserved data blocks
> + * release all the resered meta data blocks
> + * update percpu dirty block counter also.
> + */
> + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks;
> + EXT4_I(inode)->i_reserved_meta_blocks = 0;
>
> - /* recalculate the number of metablocks still need to be reserved */
> - total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
> - mdb = ext4_calc_metadata_amount(inode, total);
> -
> - /* figure out how many metablocks to release */
> - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
> -
> + }
> release = to_free + mdb_free;
>
> /* update fs dirty blocks counter for truncate case */
> percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
> -
> - /* update per-inode reservations */
> - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
> - EXT4_I(inode)->i_reserved_data_blocks -= to_free;
> -
> - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> - EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> }
>

2008-08-28 21:03:45

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 07/11] ext4: request for blocks with ar.excepted_group = -1


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> Otherwise we skip group 0 during block allocation.
> This cause ENOSPC even if we have free blocks in
> group 0. This should be merged with defrag. The
> expected_group changes are introduced by defrag patches.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---

Added to patch queue.

Mingming
> fs/ext4/balloc.c | 1 +
> fs/ext4/extents.c | 1 +
> 2 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index b19346a..53fdb05 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -2023,6 +2023,7 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
> ar.goal = goal;
> ar.len = *count;
> ar.logical = iblock;
> + ar.excepted_group = -1;
>
> if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
> /* enable in-core preallocation for data block allocation */
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index bf612a7..268e96d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2879,6 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
> ar.goal = ext4_ext_find_goal(inode, path, iblock);
> ar.logical = iblock;
> ar.len = allocated;
> + ar.excepted_group = -1;
> if (S_ISREG(inode->i_mode))
> ar.flags = EXT4_MB_HINT_DATA;
> else

2008-08-28 21:04:14

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 08/11] ext4: Signed arithematic fix


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> This patch converts some usage of ext4_fsblk_t to s64
> This is needed so that some of the sign conversion works
> as expected in if loops.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>

Reviewed-by: Mingming Cao <[email protected]>
Added to patch queue
> ---
> fs/ext4/balloc.c | 19 ++++++++++---------
> fs/ext4/ext4.h | 4 ++--
> 2 files changed, 12 insertions(+), 11 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 53fdb05..7fdc236 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1603,10 +1603,10 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> }
>
> int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> - ext4_fsblk_t nblocks)
> + s64 nblocks)
> {
> s64 free_blocks, dirty_blocks;
> - ext4_fsblk_t root_blocks = 0;
> + s64 root_blocks = 0;
> struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> @@ -1631,7 +1631,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> /* Check whether we have space after
> * accounting for current dirty blocks
> */
> - if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
> + if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
> /* we don't have free space */
> return -ENOSPC;
>
> @@ -1650,10 +1650,10 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> * On success, return nblocks
> */
> ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> - ext4_fsblk_t nblocks)
> + s64 nblocks)
> {
> - ext4_fsblk_t free_blocks, dirty_blocks;
> - ext4_fsblk_t root_blocks = 0;
> + s64 free_blocks, dirty_blocks;
> + s64 root_blocks = 0;
> struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> @@ -1667,14 +1667,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
>
> if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
> EXT4_FREEBLOCKS_WATERMARK) {
> - free_blocks = percpu_counter_sum_positive(fbc);
> - dirty_blocks = percpu_counter_sum_positive(dbc);
> + free_blocks = percpu_counter_sum(fbc);
> + dirty_blocks = percpu_counter_sum(dbc);
> }
> if (free_blocks <= (root_blocks + dirty_blocks))
> /* we don't have free space */
> return 0;
> +
> if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
> - return free_blocks - root_blocks;
> + return free_blocks - (root_blocks + dirty_blocks);
> return nblocks;
> }
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 71a4fde..13c69ed 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1048,9 +1048,9 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
> extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> ext4_fsblk_t goal, unsigned long *count, int *errp);
> extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> - ext4_fsblk_t nblocks);
> + s64 nblocks);
> extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> - ext4_fsblk_t nblocks);
> + s64 nblocks);
> extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
> ext4_fsblk_t block, unsigned long count, int metadata);
> extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,

2008-08-28 07:59:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 2008-08-27 at 14:22 -0700, Andrew Morton wrote:

> btw, what the heck is percpu_counter_init_irq()? Some mysterious
> lockdep-specific thing?
>
> <does git-fiddle. Oh. crappy changelog.>
>
> I let that one leak through uncommented. Must be getting old.
> Probably it will need an EXPORT_SYMBOL() sometime.

Basically all it does is break the percpu_counter lock into two classes.

One for the irq-unsafe users and one for the irq-safe users. Without
this, lockdep goes splat complaining about irq recursion deadlocks and
the like between these two separate users.
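
A rough sketch of the usual lockdep pattern behind that kind of split,
assuming the pre-seqlock spinlock_t in struct percpu_counter (the helper
name is made up; this is not the actual lib/percpu_counter.c code):

static struct lock_class_key percpu_counter_irqsafe_class;

static inline int my_percpu_counter_init_irq(struct percpu_counter *fbc,
					     s64 amount)
{
	int err = percpu_counter_init(fbc, amount);

	/*
	 * Give the irq-safe users their own class so lockdep does not
	 * merge them with the irq-unsafe users and report a bogus
	 * irq-recursion deadlock.
	 */
	if (!err)
		lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe_class);
	return err;
}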


2008-08-28 21:58:07

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC


On Wed, 2008-08-27 at 20:58 +0530, Aneesh Kumar K.V wrote:
> Make sure we set windowsize to zero if the free
> blocks left is less that window size. Otherwise
> we skip some group with low freeblock count during
> block allocation
>

The current code has a way to try to prevent early ENOSPC with the old ext3
block reservation: after searching all the block groups without being able
to do block reservation and allocation, it falls back to no block
reservation and scans the block groups from the beginning again.

But this doesn't work when the reservation was turned off at the first
(goal) block group allocation due to 0 free blocks, and the rest of the
block groups are skipped due to the check of "free_blocks < windowsz/2".
I think this is what causes the ENOSPC error you saw.

There are two issues. I am attaching the fix for both of them here.

Thanks,

From: Mingming Cao <[email protected]>

ext4: Fix ext4 nomballoc allocator for ENOSPC

We run into an ENOSPC error on nonmballoc ext4, even when there are free blocks
on the filesystem.

The problem is triggered when the goal block group has 0 free blocks
and the rest of the block groups are skipped due to the check of "free_blocks
< windowsz/2". The current code can fall back to non-reservation allocation
to prevent early ENOSPC after examining all the block groups with reservation on,
but this code is bypassed if the reservation window is already turned off,
which is true in this case.

This patch fixes two issues:
1) We don't need to turn off block reservation if the goal block group has
0 free blocks left; we can keep it on and continue the search in the rest
of the block groups.

The intention of the current code is to turn off the block reservation if the
goal allocation group has a few (some) free blocks left (not enough to make
the desired reservation window), to try to allocate in the goal block group
and get better locality. But if the goal group has 0 free blocks, it should
leave the block reservation on and continue the search in the next block
groups, rather than turn off block reservation completely.

2) We don't need to check the window size if the block reservation is off.

Signed-off-by: Mingming Cao <[email protected]>

Index: linux-2.6.27-rc3/fs/ext4/balloc.c
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/balloc.c 2008-08-28 12:41:55.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/balloc.c 2008-08-28 14:40:43.000000000 -0700
@@ -1807,6 +1807,7 @@
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
+ && (free_blocks > 0)
&& (rsv_is_empty(&my_rsv->rsv_window)))
my_rsv = NULL;

@@ -1843,7 +1844,7 @@
* free blocks is less than half of the reservation
* window size.
*/
- if (free_blocks <= (windowsz/2))
+ if (my_rsv && (free_blocks <= (windowsz/2)))
continue;

brelse(bitmap_bh);

2008-08-28 22:59:48

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture


On Wed, 2008-08-27 at 21:09 -0700, Andrew Morton wrote:
> On Thu, 28 Aug 2008 09:22:00 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> > > On Wed, 27 Aug 2008 23:01:52 +0200
> > > Peter Zijlstra <[email protected]> wrote:
> > >
> > > > >
> > > > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > > > +{
> > > > > > + return fbc_count(fbc);
> > > > > > +}
> > > > >
> > > > > This change means that a percpu_counter_read() from interrupt context
> > > > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > > > deadlockable on either 32-bit or 64-bit.
> > > > >
> > > > > This flows on to the lib/proportions.c, which uses
> > > > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > > > indicating that it is (or was) designed to be used in IRQ contexts.
> > > >
> > > > percpu_counter() never was irq safe, which is why the proportion stuff
> > > > does all the irq disabling bits by hand.
> > >
> > > percpu_counter_read() was irq-safe. That changes here. Needs careful
> > > review, changelogging and, preferably, runtime checks. But perhaps
> > > they should be inside some CONFIG_thing which won't normally be done in
> > > production.
> > >
> > > otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> > > overhead probably won't matter.
> > >
> > > (write-often, read-rarely is the whole point. This patch's changelog's
> > > assertion that "Since fbc->count is read more frequently and updated
> > > rarely" is probably wrong. Most percpu_counters will have their
> > > fbc->count modified far more frequently than having it read from).
> >
> > we may actually be doing percpu_counter_add. But that doesn't update
> > fbc->count. Only if the local percpu values cross FBC_BATCH we update
> > fbc->count. If we are modifying fbc->count more frequently than
> > reading fbc->count then i guess we would be contenting of fbc->lock more.
> >
> >
>
> Yep. The frequency of modification of fbc->count is of the order of a
> tenth or a hundredth of the frequency of
> precpu_counter_<modification>() calls.
>
> But in many cases the frequency of percpu_counter_read() calls is far
> far less than this. For example, the percpu_counter_read() may only
> happen when userspace polls a /proc file.
>
>

The global counter is much more frequently accessed with delalloc. :(

With delayed allocation, we have to read the free blocks counter at
each write_begin(), to make sure there are enough free blocks to do
the block reservation, to prevent a later writepages from returning ENOSPC.

Mingming

2008-08-29 03:44:58

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC

On Thu, Aug 28, 2008 at 02:57:49PM -0700, Mingming Cao wrote:
>
> From: Mingming Cao <[email protected]>
>
> ext4: Fix ext4 nomballoc allocator for ENOSPC
>
> We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> on the filesystem.
>
> The problem is triggered in the case the goal block group has 0 free blocks
> , and the rest block groups are skipped due to the check of "free_blocks
> < windowsz/2".

The goal block group had free blocks < windowsz.


>Current code could fall back to non reservation allocation
> to prevent early ENOSPC after examing all the block groups with reservation on
> , but this code was bypassed if the reservation window is turned off already,
> which is true in this case.
>
> This patch fixed two issues:
> 1) We don't need to turn off block reservation if the goal block group has
> 0 free blocks left and continue search for the rest of block groups.
>
> Current code the intention is to turn off the block reservation if the
> goal allocation group has a few (some) free blocks left (not enough
> for make the desired reservation window),to try to allocation in the
> goal block group, to get better locality. But if the goal blocks have
> 0 free blocks, it should leave the block reservation on, and continues
> search for the next block groups,rather than turn off block reservation
> completely.

I don't see how this change is going to make a difference. The goal group
had free blocks < windowsz and that made my_rsv = NULL. I guess we
should not make my_rsv NULL in the first loop. Or in other words, we can remove

/*
* if there is not enough free blocks to make a new
* resevation
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
&& (free_blocks > 0)
&& (rsv_is_empty(&my_rsv->rsv_window)))
my_rsv = NULL;

And since we have the below check in the for loop

if (my_rsv && (free_blocks <= (windowsz/2)))
continue;

We would skip all the groups that have a low free block count.
Now if we are not able to allocate any blocks (ENOSPC),
we loop back because of

if (my_rsv) {
my_rsv = NULL;
windowsz = 0;
group_no = goal_group;
goto retry_alloc;
}

and that would allocate blocks from the first group available.
This also gives us a chance to scan all the groups and check whether
any of them still has enough free blocks to add to the reservation.
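
Putting those pieces together, the scan order being described looks roughly
like the sketch below. The helpers (my_fs, group_free_blocks(),
try_alloc_in_group()) are invented for illustration; this is not the real
ext4_old_new_blocks(), just the shape of its two passes.

#include <linux/errno.h>

struct my_fs { unsigned int groups_count; };
struct my_rsv_window;	/* opaque here */

extern unsigned int group_free_blocks(struct my_fs *fs, unsigned int group);
extern long try_alloc_in_group(struct my_fs *fs, unsigned int group,
			       struct my_rsv_window *rsv);

static long alloc_with_retry(struct my_fs *fs, unsigned int goal_group,
			     struct my_rsv_window *my_rsv, unsigned int windowsz)
{
	unsigned int i, group;
	long block;

retry_alloc:
	group = goal_group;
	for (i = 0; i < fs->groups_count;
	     i++, group = (group + 1) % fs->groups_count) {
		/* with a reservation, nearly-full groups are not worth scanning */
		if (my_rsv && group_free_blocks(fs, group) <= windowsz / 2)
			continue;

		block = try_alloc_in_group(fs, group, my_rsv);
		if (block >= 0)
			return block;
	}

	if (my_rsv) {
		/* second pass: drop the reservation and take whatever is left */
		my_rsv = NULL;
		windowsz = 0;
		goto retry_alloc;
	}
	return -ENOSPC;
}

The point above is that keeping my_rsv non-NULL on the first pass costs
nothing: once every group has been tried with the reservation on, the
fallback pass with my_rsv == NULL can still reach any group that has a
single free block left.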


>
> 2) we don't need to check the window size if the block reservation is off.


I have already tested this change.

>
> Signed-off-by: Mingming Cao <[email protected]>


Signed-off-by: Aneesh Kumar K.V <[email protected]>


>
> Index: linux-2.6.27-rc3/fs/ext4/balloc.c
> ===================================================================
> --- linux-2.6.27-rc3.orig/fs/ext4/balloc.c 2008-08-28 12:41:55.000000000 -0700
> +++ linux-2.6.27-rc3/fs/ext4/balloc.c 2008-08-28 14:40:43.000000000 -0700
> @@ -1807,6 +1807,7 @@
> * turn off reservation for this allocation
> */
> if (my_rsv && (free_blocks < windowsz)
> + && (free_blocks > 0)
> && (rsv_is_empty(&my_rsv->rsv_window)))
> my_rsv = NULL;
>
> @@ -1843,7 +1844,7 @@
> * free blocks is less than half of the reservation
> * window size.
> */
> - if (free_blocks <= (windowsz/2))
> + if (my_rsv && (free_blocks <= (windowsz/2)))
> continue;
>
> brelse(bitmap_bh);
>
>

2008-08-29 04:14:26

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC

On Fri, Aug 29, 2008 at 09:14:51AM +0530, Aneesh Kumar K.V wrote:
> On Thu, Aug 28, 2008 at 02:57:49PM -0700, Mingming Cao wrote:
> >
> > From: Mingming Cao <[email protected]>
> >
> > ext4: Fix ext4 nomballoc allocator for ENOSPC
> >
> > We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> > on the filesystem.
> >
> > The problem is triggered in the case the goal block group has 0 free blocks
> > , and the rest block groups are skipped due to the check of "free_blocks
> > < windowsz/2".
>
> The goal block group had free blocks < windowsz .
>
>

OK, how about this? The final change is the same as what you have done,
but it makes the code easier to understand. I also added a comment
explaining the details.

commit 0216ee1ac13270c1ab7b7517d41775727f7da02d
Author: Aneesh Kumar K.V <[email protected]>
Date: Fri Aug 29 09:35:15 2008 +0530

ext4: Fix ext4 nomballoc allocator for ENOSPC

We run into an ENOSPC error on non-mballoc ext4, even when there are free
blocks on the filesystem.

The patch includes two changes:

a) Set the reservation to NULL if we are trying to allocate near
group_target_block from the goal group and the free block count in the
group is less than windowsz. This should give us a better chance to
allocate near group_target_block. It also ensures that if we are not
allocating near group_target_block we don't turn off the reservation,
so we can still allocate with a reservation from other groups that have
a large free block count.

b) We don't need to check the window size if block reservation is off.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Mingming Cao <[email protected]>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cfe01b4..399bec5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1802,15 +1802,17 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
goto io_error;

free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * if there is not enough free blocks to make a new resevation
- * turn off reservation for this allocation
- */
- if (my_rsv && (free_blocks < windowsz)
- && (rsv_is_empty(&my_rsv->rsv_window)))
- my_rsv = NULL;

if (free_blocks > 0) {
+ /*
+ * try to allocate with group target block
+ * in the goal group. If we have low free_blocks
+ * count turn off reservation
+ */
+ if (my_rsv && (free_blocks < windowsz)
+ && (rsv_is_empty(&my_rsv->rsv_window)))
+ my_rsv = NULL;
+
bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
@@ -1843,7 +1845,7 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
* free blocks is less than half of the reservation
* window size.
*/
- if (free_blocks <= (windowsz/2))
+ if (my_rsv && (free_blocks <= (windowsz/2)))
continue;

brelse(bitmap_bh);

2008-08-29 05:02:13

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC


On Fri, 2008-08-29 at 09:44 +0530, Aneesh Kumar K.V wrote:
> On Fri, Aug 29, 2008 at 09:14:51AM +0530, Aneesh Kumar K.V wrote:
> > On Thu, Aug 28, 2008 at 02:57:49PM -0700, Mingming Cao wrote:
> > >
> > > From: Mingming Cao <[email protected]>
> > >
> > > ext4: Fix ext4 nomballoc allocator for ENOSPC
> > >
> > > We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> > > on the filesystem.
> > >
> > > The problem is triggered in the case the goal block group has 0 free blocks
> > > , and the rest block groups are skipped due to the check of "free_blocks
> > > < windowsz/2".
> >
> > The goal block group had free blocks < windowsz .
> >
> >
>
> Ok how about this. The Final change is same to what you have done.
> But it make the code easier to understand. I also added a comment
> explaining the details
>

Fine with me, I will update the patch in the ext4 patch queue with the
additional comment. But Andrew has already taken the ext2/3 version into
the -mm tree; I am not sure if it is worth resending a patch against the
original patch.

> commit 0216ee1ac13270c1ab7b7517d41775727f7da02d
> Author: Aneesh Kumar K.V <[email protected]>
> Date: Fri Aug 29 09:35:15 2008 +0530
>
> ext4: Fix ext4 nomballoc allocator for ENOSPC
>
> We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> on the filesystem.
>
> The patch include two changes
>
> a) Set reservation to NULL if we trying to allocate near group_target_block
> from the goal group if the free block in the group is less than windowsz.
> This should give us a better chance to allocate near group_target_block.
> This also ensures that if we are not allocating near group_target_block
> then we don't trun off reservation. This should enable us to allocate
> with reservation from other groups that have large free blocks count.
>
> b) we don't need to check the window size if the block reservation is off.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> Signed-off-by: Mingming Cao <[email protected]>
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index cfe01b4..399bec5 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1802,15 +1802,17 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> goto io_error;
>
> free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
> - /*
> - * if there is not enough free blocks to make a new resevation
> - * turn off reservation for this allocation
> - */
> - if (my_rsv && (free_blocks < windowsz)
> - && (rsv_is_empty(&my_rsv->rsv_window)))
> - my_rsv = NULL;
>
> if (free_blocks > 0) {
> + /*
> + * try to allocate with group target block
> + * in the goal group. If we have low free_blocks
> + * count turn off reservation
> + */
> + if (my_rsv && (free_blocks < windowsz)
> + && (rsv_is_empty(&my_rsv->rsv_window)))
> + my_rsv = NULL;
> +
> bitmap_bh = ext4_read_block_bitmap(sb, group_no);
> if (!bitmap_bh)
> goto io_error;
> @@ -1843,7 +1845,7 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> * free blocks is less than half of the reservation
> * window size.
> */
> - if (free_blocks <= (windowsz/2))
> + if (my_rsv && (free_blocks <= (windowsz/2)))
> continue;
>
> brelse(bitmap_bh);

2008-08-29 05:06:19

by Mingming Cao

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC


On Fri, 2008-08-29 at 09:14 +0530, Aneesh Kumar K.V wrote:
> On Thu, Aug 28, 2008 at 02:57:49PM -0700, Mingming Cao wrote:
> >
> > From: Mingming Cao <[email protected]>
> >
> > ext4: Fix ext4 nomballoc allocator for ENOSPC
> >
> > We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> > on the filesystem.
> >
> > The problem is triggered in the case the goal block group has 0 free blocks
> > , and the rest block groups are skipped due to the check of "free_blocks
> > < windowsz/2".
>
> The goal block group had free blocks < windowsz .
>
Hmm, if the goal block group had free blocks, why did the allocation fail
(reservation is turned off by setting my_rsv to NULL)? I wonder if other
threads trying to allocate in the same goal block group at the same time
stole the last free blocks?

Mingming

2008-08-29 08:26:55

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 09/11] ext4: Fix ext4 nomballoc allocator for ENOSPC

On Thu, Aug 28, 2008 at 10:06:14PM -0700, Mingming Cao wrote:
>
> On Fri, 2008-08-29 at 09:14 +0530, Aneesh Kumar K.V wrote:
> > On Thu, Aug 28, 2008 at 02:57:49PM -0700, Mingming Cao wrote:
> > >
> > > From: Mingming Cao <[email protected]>
> > >
> > > ext4: Fix ext4 nomballoc allocator for ENOSPC
> > >
> > > We run into ENOSPC error on nonmballoc ext4, even when there is free blocks
> > > on the filesystem.
> > >
> > > The problem is triggered in the case the goal block group has 0 free blocks
> > > , and the rest block groups are skipped due to the check of "free_blocks
> > > < windowsz/2".
> >
> > The goal block group had free blocks < windowsz .
> >
> Hmm, if the goal block group had free blocks, why allocation failed
> (reservation is turned off by setting my_rsv as NULL)? I wonder if there
> is other threads trying to allocating in the same goal block group at
> the same time, steal the last free blocks?
>

We are trying block allocation with a grp_target_blk there, and even if
the reservation is turned off it can return ENOSPC.

-aneesh

2008-10-09 20:51:15

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

Aneesh Kumar K.V wrote:
> This patch adds dirty block accounting using percpu_counters.
> Delayed allocation block reservation is now done by updating the
> dirty block counter. In a later patch we switch to non-delalloc
> mode if the filesystem free blocks are less than 150% of the
> total filesystem dirty blocks.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>

...

(nitpick, I wish the changelog stated why the change was made, rather
than simply describing the change...) but anyway:

> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 419009f..4da4b9a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
> spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> -
> + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
> /*
> - * free blocks account has already be reduced/reserved
> - * at write_begin() time for delayed allocation
> - * do not double accounting
> + * Now reduce the dirty block count also. Should not go negative
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> - ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> - /*
> - * we allocated less blocks than we calimed
> - * Add the difference back
> - */
> - percpu_counter_add(&sbi->s_freeblocks_counter,
> - ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> - }
> -
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> ac->ac_b_ex.fe_group);

Why was this part removed? Near as I can tell it's still needed; with
all patches in the queue applied, if I run fallocate to try and allocate
10G of space to a file, on a filesystem with 30G free, I run out of
space after only 1.6G is allocated!

# /mnt/test/fallocate-amit -f /mnt/test/testfile 0 10737418240

SYSCALL: received error 28, ret=-1
# FALLOCATE TEST REPORT #
New blocks preallocated = 0.
Number of bytes preallocated = 0
Old file size = 0, New file size -474484472.
Old num blocks = 0, New num blocks 0.
test_fallocate: ERROR ! ret=1


#!# TESTS FAILED #!#

I see the request for the original 2621440 blocks come in; this gets
limited to 32767 due to max uninit length.

Somehow, though, we seem to be allocating only 2048 blocks at a time
(haven't worked out why, yet - this also seems problematic) - but at any
rate, losing (32767-2048) blocks in each loop from fallocate seems to be
causing this space loss and eventual ENOSPC.

fallocate loops 243 times for me; losing (32767-2048) each time accounts
for the 28G:

(32767-2048)*243*4096/1024/1024/1024 ≈ 28

(plus the ~2G actually allocated gets us back to 30G that was originally
free)

Anyway, fsck finds no errors, and remounting fixes it. It's apparently
just the in-memory counters that get off.

-Eric

2008-10-10 04:52:25

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

On Thu, Oct 09, 2008 at 03:44:51PM -0500, Eric Sandeen wrote:
> Aneesh Kumar K.V wrote:
> > This patch add dirty block accounting using percpu_counters.
> > Delayed allocation block reservation is now done by updating
> > dirty block counter. In the later patch we switch to non
> > delalloc mode if the filesystem free blocks is < that
> > 150 % of total filesystem dirty blocks
> >
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
>
> ...
>
> (nitpick, I wish the changelog stated why the change was made, rather
> than simply describing the change...) but anyway:
>
> > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > index 419009f..4da4b9a 100644
> > --- a/fs/ext4/mballoc.c
> > +++ b/fs/ext4/mballoc.c
> > @@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> > le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
> > gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
> > spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> > -
> > + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
> > /*
> > - * free blocks account has already be reduced/reserved
> > - * at write_begin() time for delayed allocation
> > - * do not double accounting
> > + * Now reduce the dirty block count also. Should not go negative
> > */
> > - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> > - ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> > - /*
> > - * we allocated less blocks than we calimed
> > - * Add the difference back
> > - */
> > - percpu_counter_add(&sbi->s_freeblocks_counter,
> > - ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> > - }
> > -
> > + percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
> > if (sbi->s_log_groups_per_flex) {
> > ext4_group_t flex_group = ext4_flex_group(sbi,
> > ac->ac_b_ex.fe_group);
>
> Why was this part removed? Near as I can tell it's still needed; with
> all patches in the queue applied, if I run fallocate to try and allocate
> 10G of space to a file, on a filesystem with 30G free, I run out of
> space after only 1.6G is allocated!
>
> # /mnt/test/fallocate-amit -f /mnt/test/testfile 0 10737418240
>
> SYSCALL: received error 28, ret=-1
> # FALLOCATE TEST REPORT #
> New blocks preallocated = 0.
> Number of bytes preallocated = 0
> Old file size = 0, New file size -474484472.
> Old num blocks = 0, New num blocks 0.
> test_fallocate: ERROR ! ret=1
>
>
> #!# TESTS FAILED #!#
>
> I see the request for the original 2621440 blocks come in; this gets
> limited to 32767 due to max uninit length.
>
> Somehow, though, we seem to be allocating only 2048 blocks at a time
> (haven't worked out why, yet - this also seems problematic) - but at any
> rate, losing (32767-2048) blocks in each loop from fallocate seems to be
> causing this space loss and eventual ENOSPC.
>
> fallocate loops 243 times for me; losing (32767-2048) each time accounts
> for the 28G:
>
> (32767-2048)*243*4096/1024/1024/1024
> 28
>
> (plus the ~2G actually allocated gets us back to 30G that was originally
> free)
>
> Anyway, fsck finds no errors, and remounting fixes it. It's apparently
> just the in-memory counters that get off.
>

Can you test this patch?

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 64eeb9a..6e81c38 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2800,7 +2800,7 @@ void exit_ext4_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle)
+ handle_t *handle, unsigned long reserv_blks)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_super_block *es;
@@ -2893,7 +2893,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
/*
* Now reduce the dirty block count also. Should not go negative
*/
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
@@ -4284,12 +4284,13 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
+ int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
- int freed;
- int inquota;
+ unsigned long inquota;
+ unsigned long reserv_blks;

sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
@@ -4308,6 +4309,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0;
}
}
+ /* Number of reserv_blks for both delayed an non delayed allocation */
+ reserv_blks = ar->len;
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
@@ -4353,7 +4356,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}

if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
if (*errp == -EAGAIN) {
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
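
For what it is worth, the leak Eric measured matches the arithmetic of
subtracting the allocated length instead of the claimed length. A throw-away
user-space model (not kernel code; the numbers are taken from Eric's report)
of the broken accounting:

#include <stdio.h>

int main(void)
{
	long long dirty = 0;
	long long reserved = 32767;	/* ar->len claimed per fallocate loop */
	long long allocated = 2048;	/* ac->ac_b_ex.fe_len actually allocated */
	int i, loops = 243;

	for (i = 0; i < loops; i++) {
		dirty += reserved;	/* counted as dirty at claim time */
		dirty -= allocated;	/* bug: only the allocated part comes back */
	}

	/* leaked blocks, times a 4K block size, in GiB */
	printf("leaked %.1f GiB\n", dirty * 4096.0 / (1024 * 1024 * 1024));
	return 0;
}

This prints roughly 28.5 GiB, which is the space Eric saw disappear. Passing
reserv_blks (the length captured at claim time) to
ext4_mb_mark_diskspace_used(), as the patch above does, makes both sides of
the accounting use the same number.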

2008-10-10 04:58:39

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

Aneesh Kumar K.V wrote:

> Can you test this patch

This does fix my 10G-fallocate testcase.

-Eric

> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 64eeb9a..6e81c38 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2800,7 +2800,7 @@ void exit_ext4_mballoc(void)
> */
> static noinline_for_stack int
> ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> - handle_t *handle)
> + handle_t *handle, unsigned long reserv_blks)
> {
> struct buffer_head *bitmap_bh = NULL;
> struct ext4_super_block *es;
> @@ -2893,7 +2893,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> /*
> * Now reduce the dirty block count also. Should not go negative
> */
> - percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> ac->ac_b_ex.fe_group);
> @@ -4284,12 +4284,13 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
> ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> struct ext4_allocation_request *ar, int *errp)
> {
> + int freed;
> struct ext4_allocation_context *ac = NULL;
> struct ext4_sb_info *sbi;
> struct super_block *sb;
> ext4_fsblk_t block = 0;
> - int freed;
> - int inquota;
> + unsigned long inquota;
> + unsigned long reserv_blks;
>
> sb = ar->inode->i_sb;
> sbi = EXT4_SB(sb);
> @@ -4308,6 +4309,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> return 0;
> }
> }
> + /* Number of reserv_blks for both delayed an non delayed allocation */
> + reserv_blks = ar->len;
> while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
> ar->flags |= EXT4_MB_HINT_NOPREALLOC;
> ar->len--;
> @@ -4353,7 +4356,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> }
>
> if (likely(ac->ac_status == AC_STATUS_FOUND)) {
> - *errp = ext4_mb_mark_diskspace_used(ac, handle);
> + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
> if (*errp == -EAGAIN) {
> ac->ac_b_ex.fe_group = 0;
> ac->ac_b_ex.fe_start = 0;


2008-10-11 21:10:22

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH -V3 04/11] ext4: Add percpu dirty block accounting.

On Oct 09, 2008 15:44 -0500, Eric Sandeen wrote:
> Somehow, though, we seem to be allocating only 2048 blocks at a time
> (haven't worked out why, yet - this also seems problematic) - but at any
> rate, losing (32767-2048) blocks in each loop from fallocate seems to be
> causing this space loss and eventual ENOSPC.

I believe the 2048-block (8MB) allocation limit is imposed by mballoc to
avoid scanning the whole filesystem looking for huge chunks of free disk.
That said, it would be nice if, when there IS lots of free space, the
allocation were done optimistically in larger chunks where possible.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.
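
A minimal sketch of the kind of cap Andreas describes; the constant and the
function name are hypothetical, not the actual mballoc normalization code:

/* hypothetical cap: 2048 blocks of 4KB = 8MB per allocation request */
#define MB_MAX_REQUEST_BLOCKS	2048

static unsigned int normalize_request_len(unsigned int requested)
{
	/*
	 * trim large requests so the allocator never has to scan the
	 * whole filesystem hunting for one enormous free extent
	 */
	if (requested > MB_MAX_REQUEST_BLOCKS)
		return MB_MAX_REQUEST_BLOCKS;
	return requested;
}

With such a cap in place, the space Eric saw disappear was not the clamp
itself but the reservation accounting around it, which the earlier patch
addresses.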