2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

fbc->count is of type s64. The change was introduced by
0216bfcffe424a5473daa4da47440881b36c1f4, which changed the type
from long to s64. Moving to s64 also means that on 32 bit architectures
we can get wrong values from fbc->count. Since fbc->count is read
frequently and updated rarely, use seqlocks. This should
reduce the impact of locking in the read path on 32 bit architectures.

percpu_counter_read() is also used within interrupt context, so
use the IRQ-safe version of the seqlock while reading.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
---
include/linux/percpu_counter.h | 29 +++++++++++++++++++++++++----
lib/percpu_counter.c | 20 ++++++++++----------
2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccd..36f3d2d 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -6,7 +6,7 @@
* WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
*/

-#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
@@ -16,7 +16,7 @@
#ifdef CONFIG_SMP

struct percpu_counter {
- spinlock_t lock;
+ seqlock_t lock;
s64 count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
@@ -53,10 +53,31 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return __percpu_counter_sum(fbc);
}

-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+#if BITS_PER_LONG == 64
+static inline s64 fbc_count(struct percpu_counter *fbc)
{
return fbc->count;
}
+#else
+/* doesn't have atomic 64 bit operation */
+static inline s64 fbc_count(struct percpu_counter *fbc)
+{
+ s64 ret;
+ unsigned seq;
+ unsigned long flags;
+ do {
+ seq = read_seqbegin_irqsave(&fbc->lock, flags);
+ ret = fbc->count;
+ } while(read_seqretry_irqrestore(&fbc->lock, seq, flags));
+ return ret;
+
+}
+#endif
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+ return fbc_count(fbc);
+}

/*
* It is possible for the percpu_counter_read() to return a small negative
@@ -65,7 +86,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = fbc_count(fbc);

barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a866389..83bb809 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -18,13 +18,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
for_each_possible_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
*pcount = 0;
}
fbc->count = amount;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
}
EXPORT_SYMBOL(percpu_counter_set);

@@ -37,10 +37,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
if (count >= batch || count <= -batch) {
- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
fbc->count += count;
*pcount = 0;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
} else {
*pcount = count;
}
@@ -57,7 +57,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
s64 ret;
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
ret = fbc->count;
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
@@ -66,7 +66,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
}
fbc->count = ret;

- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
- spin_lock_init(&fbc->lock);
+ seqlock_init(&fbc->lock);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
@@ -95,7 +95,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)

err = percpu_counter_init(fbc, amount);
if (!err)
- lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+ lockdep_set_class(&fbc->lock.lock, &percpu_counter_irqsafe);
return err;
}

@@ -130,11 +130,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
s32 *pcount;
unsigned long flags;

- spin_lock_irqsave(&fbc->lock, flags);
+ write_seqlock_irqsave(&fbc->lock, flags);
pcount = per_cpu_ptr(fbc->counters, cpu);
fbc->count += *pcount;
*pcount = 0;
- spin_unlock_irqrestore(&fbc->lock, flags);
+ write_sequnlock_irqrestore(&fbc->lock, flags);
}
mutex_unlock(&percpu_counters_lock);
return NOTIFY_OK;
--
1.6.0.1.90.g27a6e
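
To illustrate the problem the patch above addresses (a plain 64-bit load is
not atomic on 32-bit architectures, so an unlocked read of fbc->count can
observe a half-updated value), here is a minimal user-space sketch. It is
only an analogue of the kernel situation, using pthreads and a plain
int64_t instead of a percpu_counter, and it is expected to show torn reads
only when built as a 32-bit binary, e.g. with gcc -m32 -O2 -pthread:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Two values that differ in both 32-bit halves. */
#define VAL_A  INT64_C(0)
#define VAL_B  INT64_C(0x100000001)

static volatile int64_t counter = VAL_A;

static void *writer(void *arg)
{
	(void)arg;
	for (;;) {
		counter = VAL_A;	/* on a 32-bit build each store is */
		counter = VAL_B;	/* two 32-bit stores               */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	for (;;) {
		int64_t v = counter;	/* unsynchronized 64-bit read */

		if (v != VAL_A && v != VAL_B) {
			/* mixed halves of two different stores */
			printf("torn read: 0x%llx\n", (unsigned long long)v);
			return 0;
		}
	}
}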



2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] ext4: request for blocks with ar.excepted_group = -1

Otherwise we skip group 0 during block allocation.
This causes ENOSPC even if we have free blocks in
group 0. This should be merged with the defrag series; the
excepted_group changes are introduced by the defrag patches.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 1 +
fs/ext4/extents.c | 1 +
2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 4ebe3b6..1a068f5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -2034,6 +2034,7 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ar.goal = goal;
ar.len = *count;
ar.logical = iblock;
+ ar.excepted_group = -1;

if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
/* enable in-core preallocation for data block allocation */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bf612a7..268e96d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2879,6 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ar.goal = ext4_ext_find_goal(inode, path, iblock);
ar.logical = iblock;
ar.len = allocated;
+ ar.excepted_group = -1;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
--
1.6.0.1.90.g27a6e


2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] ext4: Retry block reservation

If we don't have enough blocks left during block reservation,
retry the reservation with a smaller block count. This makes sure
fallocate and DIO are retried with a smaller request size and don't
fail early. The delayed allocation reservation cannot retry with a
smaller block count, so retry the block reservation itself to handle
temporary disk-full conditions. Also print free block details if
block allocation fails during writepages.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 8 +++++++-
fs/ext4/inode.c | 33 ++++++++++++++++++++++++++++++---
fs/ext4/mballoc.c | 7 ++++++-
3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 4a53541..b7d1347 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1760,10 +1760,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
/*
* With delalloc we already reserved the blocks
*/
- if (ext4_claim_free_blocks(sbi, *count)) {
+ while (*count && ext4_claim_free_blocks(sbi, *count)) {
+ /* let others to free the space */
+ yield();
+ *count = *count >> 1;
+ }
+ if (!*count) {
*errp = -ENOSPC;
return 0; /*return with ENOSPC error */
}
+ num = *count;
}
/*
* Check quota for allocation of this block.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d965a05..72a4a71 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1521,6 +1521,7 @@ static int ext4_journalled_write_end(struct file *file,

static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
+ int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned long md_needed, mdblocks, total = 0;

@@ -1529,6 +1530,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
* in order to allocate nrblocks
* worse case is one extent per block
*/
+repeat:
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1539,6 +1541,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)

if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ yield();
+ goto repeat;
+ }
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1813,6 +1819,26 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
return;
}

+static void ext4_print_free_blocks(struct super_block *sb)
+{
+#ifdef EXT4FS_DEBUG
+ ext4_group_t i;
+ struct ext4_group_desc *gdp;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ printk(KERN_DEBUG "Free blocks details\n");
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ printk(KERN_DEBUG "Free blocks in group %lu is %d\n",
+ i, le16_to_cpu(gdp->bg_free_blocks_count));
+ }
+#endif
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(sb));
+ return;
+}
+
/*
* mpage_da_map_blocks - go through given space
*
@@ -1825,20 +1851,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err = 0;
+ struct buffer_head new;
struct buffer_head *lbh = &mpd->lbh;
sector_t next = lbh->b_blocknr;
- struct buffer_head new;

/*
* We consider only non-mapped and non-allocated blocks
*/
if (buffer_mapped(lbh) && !buffer_delay(lbh))
return 0;
-
new.b_state = lbh->b_state;
new.b_blocknr = 0;
new.b_size = lbh->b_size;
-
/*
* If we didn't accumulate anything
* to write simply return
@@ -1871,6 +1895,9 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
lbh->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
+ if (err == -ENOSPC) {
+ ext4_print_free_blocks(mpd->inode->i_sb);
+ }
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
lbh->b_size >> mpd->inode->i_blkbits);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4404b46..419009f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4397,7 +4397,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
/*
* With delalloc we already reserved the blocks
*/
- if (ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ /* let others to free the space */
+ yield();
+ ar->len = ar->len >> 1;
+ }
+ if (!ar->len) {
*errp = -ENOSPC;
return 0;
}
--
1.6.0.1.90.g27a6e
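
The retry loops added above to ext4_old_new_blocks() and
ext4_mb_new_blocks() share the same shape. A compact standalone sketch of
that shape, with claim() standing in for ext4_claim_free_blocks() and
sched_yield() for the in-kernel yield(), looks like this:

#include <sched.h>

/* Returns 0 if "nblocks" blocks could be claimed, non-zero otherwise
 * (stand-in for ext4_claim_free_blocks()). */
typedef int (*claim_fn)(unsigned long nblocks);

/*
 * Halve the request until it can be satisfied or nothing is left,
 * yielding between attempts so other tasks get a chance to free space.
 */
static int claim_with_backoff(unsigned long *count, claim_fn claim)
{
	while (*count && claim(*count)) {
		sched_yield();		/* let others free some space */
		*count >>= 1;
	}
	return *count ? 0 : -1;		/* -1 plays the role of -ENOSPC */
}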


2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] ext4: Switch to non delalloc mode when we are low on free blocks count.

Delayed allocation allocates blocks during writepages, which also
means we cannot handle block allocation failures there. Switch to
non-delalloc mode when we are running low on free blocks.
Delayed allocation needs to do aggressive metadata block reservation,
considering that the requested blocks can all be discontiguous;
switching to non-delalloc avoids that. Also, we can satisfy
partial writes in non-delalloc mode.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/inode.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3f3ecc0..d923a14 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2482,6 +2482,29 @@ static int ext4_da_writepages(struct address_space *mapping,
return ret;
}

+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+ s64 free_blocks, dirty_blocks;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * switch to non delalloc mode if we are running low
+ * on free block. The free block accounting via percpu
+ * counters can get slightly wrong with FBC_BATCH getting
+ * accumulated on each CPU without updating global counters
+ * Delalloc need an accurate free block accounting. So switch
+ * to non delalloc when we are near to error range.
+ */
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+ if ( 2 * free_blocks < 3 * dirty_blocks) {
+ /* free block count is less that 150% of dirty blocks */
+ return 1;
+ }
+ return 0;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2496,6 +2519,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+
+ if (ext4_nonda_switch(inode->i_sb)) {
+ *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+ return ext4_write_begin(file, mapping, pos,
+ len, flags, pagep, fsdata);
+ }
+ *fsdata = (void *)0;
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2564,6 +2594,19 @@ static int ext4_da_write_end(struct file *file,
handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
unsigned long start, end;
+ int write_mode = (int)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+ if (ext4_should_order_data(inode)) {
+ return ext4_ordered_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else if (ext4_should_writeback_data(inode)) {
+ return ext4_writeback_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else {
+ BUG();
+ }
+ }

start = pos & (PAGE_CACHE_SIZE - 1);
end = start + copied -1;
@@ -4901,6 +4944,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
loff_t size;
unsigned long len;
int ret = -EINVAL;
+ void *fsdata;
struct file *file = vma->vm_file;
struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
@@ -4939,11 +4983,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
* on the same page though
*/
ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
- len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
if (ret < 0)
goto out_unlock;
ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
- len, len, page, NULL);
+ len, len, page, fsdata);
if (ret < 0)
goto out_unlock;
ret = 0;
--
1.6.0.1.90.g27a6e
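
The threshold test introduced above in ext4_nonda_switch() uses integer
arithmetic only: 2 * free_blocks < 3 * dirty_blocks is the same condition
as free_blocks < 1.5 * dirty_blocks, i.e. the free block count has dropped
below 150% of the dirty block count. A small self-contained sketch of just
that check, with made-up counter values for illustration:

#include <stdio.h>

/* 2*free < 3*dirty  <=>  free < 1.5*dirty, using integers only */
static int nonda_switch_check(long long free_blocks, long long dirty_blocks)
{
	return 2 * free_blocks < 3 * dirty_blocks;
}

int main(void)
{
	/* 1400 free vs 1000 dirty: below 150%, fall back to non-delalloc */
	printf("%d\n", nonda_switch_check(1400, 1000));	/* prints 1 */
	/* 1600 free vs 1000 dirty: above 150%, keep delalloc */
	printf("%d\n", nonda_switch_check(1600, 1000));	/* prints 0 */
	return 0;
}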


2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] ext4: Make sure all the block allocation paths reserve blocks

With delayed allocation we need to make sure blocks are reserved
before we attempt to allocate them. Otherwise we get a block
allocation failure (ENOSPC) during writepages, which cannot
be handled. That would mean silent data loss (we do a printk
stating that data will be lost). This patch updates the DIO
and fallocate code paths to do block reservation before block
allocation.

When the free blocks count goes below a threshold, we switch to
a slow path which looks at the other CPUs' accumulated percpu
counter values.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 72 ++++++++++++++++++++++++++++++++++++++++++++---------
fs/ext4/ext4.h | 2 +
fs/ext4/inode.c | 5 +---
fs/ext4/mballoc.c | 23 +++++++++-------
4 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cfed283..4a53541 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1602,6 +1602,47 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
return ret;
}

+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
+{
+ s64 free_blocks;
+ ext4_fsblk_t root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+ free_blocks = percpu_counter_read(fbc);
+
+ if (!capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ /* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+ if (free_blocks - (nblocks + root_blocks) <
+ (4 * (FBC_BATCH * nr_cpu_ids))) {
+ /*
+ * We need to sum and claim under lock
+ * This is the slow patch which will be
+ * taken when we are very low on free blocks
+ */
+ if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
+ return -ENOSPC;
+ /* add root_blocks back */
+ percpu_counter_add(fbc, root_blocks);
+ return 0;
+ }
+#endif
+ if (free_blocks < (root_blocks + nblocks))
+ /* we don't have free space */
+ return -ENOSPC;
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(fbc, nblocks);
+ return 0;
+}
+
/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
@@ -1624,9 +1665,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
root_blocks = ext4_r_blocks_count(sbi->s_es);
#ifdef CONFIG_SMP
- if (free_blocks - root_blocks < FBC_BATCH)
+ /* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+ if (free_blocks - (nblocks + root_blocks) <
+ (4 * (FBC_BATCH * nr_cpu_ids))) {
free_blocks =
percpu_counter_sum(&sbi->s_freeblocks_counter);
+ }
#endif
if (free_blocks <= root_blocks)
/* we don't have free space */
@@ -1634,7 +1681,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
if (free_blocks - root_blocks < nblocks)
return free_blocks - root_blocks;
return nblocks;
- }
+}


/**
@@ -1713,14 +1760,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
/*
* With delalloc we already reserved the blocks
*/
- *count = ext4_has_free_blocks(sbi, *count);
- }
- if (*count == 0) {
- *errp = -ENOSPC;
- return 0; /*return with ENOSPC error */
+ if (ext4_claim_free_blocks(sbi, *count)) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
}
- num = *count;
-
/*
* Check quota for allocation of this block.
*/
@@ -1915,9 +1959,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- if (!EXT4_I(inode)->i_delalloc_reserved_flag)
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+ /*
+ * we allocated less blocks than we
+ * claimed. Add the difference back.
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+ }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7f11b25..3738039 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
unsigned long *count, int *errp);
extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1c289c1..d965a05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;

- if (ext4_has_free_blocks(sbi, total) < total) {
+ if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return -ENOSPC;
}
- /* reduce fs free blocks counter */
- percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 82dd0e4..4404b46 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* at write_begin() time for delayed allocation
* do not double accounting
*/
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
- percpu_counter_sub(&sbi->s_freeblocks_counter,
- ac->ac_b_ex.fe_len);
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+ ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+ /*
+ * we allocated less blocks than we calimed
+ * Add the difference back
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter,
+ ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+ }

if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
/*
* With delalloc we already reserved the blocks
*/
- ar->len = ext4_has_free_blocks(sbi, ar->len);
- }
-
- if (ar->len == 0) {
- *errp = -ENOSPC;
- return 0;
+ if (ext4_claim_free_blocks(sbi, ar->len)) {
+ *errp = -ENOSPC;
+ return 0;
+ }
}

2008-08-25 11:20:42

by Aneesh Kumar K.V

Subject: [RFC PATCH -v2] ext4: Add percpu dirty block accounting.

This patch adds dirty block accounting using percpu_counters.
Delayed allocation block reservation is now done by updating the
dirty block counter. In a later patch we switch to non-delalloc
mode if the filesystem's free block count is less than 150% of the
total filesystem dirty blocks.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 64 ++++++++++++++++++++++++++++------------------------
fs/ext4/ext4_sb.h | 1 +
fs/ext4/inode.c | 25 ++++++++++++--------
fs/ext4/mballoc.c | 17 ++-----------
fs/ext4/super.c | 8 +++++-
5 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b7d1347..4ebe3b6 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1605,11 +1605,13 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks)
{
- s64 free_blocks;
+ s64 free_blocks, dirty_blocks;
ext4_fsblk_t root_blocks = 0;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

- free_blocks = percpu_counter_read(fbc);
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);

if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
@@ -1620,26 +1622,27 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
* counters. So we need to make sure we have free blocks more
* than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
*/
- if (free_blocks - (nblocks + root_blocks) <
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
(4 * (FBC_BATCH * nr_cpu_ids))) {
- /*
- * We need to sum and claim under lock
- * This is the slow patch which will be
- * taken when we are very low on free blocks
- */
- if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
- return -ENOSPC;
- /* add root_blocks back */
- percpu_counter_add(fbc, root_blocks);
- return 0;
+
+ free_blocks = percpu_counter_sum(fbc);
+ dirty_blocks = percpu_counter_sum(dbc);
+ if (dirty_blocks < 0) {
+ printk(KERN_CRIT "Dirty block accounting "
+ "went wrong %lld\n",
+ dirty_blocks);
+ }
}
#endif
- if (free_blocks < (root_blocks + nblocks))
+ /* Check whether we have space after
+ * accounting for current dirty blocks
+ */
+ if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
/* we don't have free space */
return -ENOSPC;

- /* reduce fs free blocks counter */
- percpu_counter_sub(fbc, nblocks);
+ /* Add the blocks to nblocks */
+ percpu_counter_add(dbc, nblocks);
return 0;
}

@@ -1655,10 +1658,13 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks;
+ ext4_fsblk_t free_blocks, dirty_blocks;
ext4_fsblk_t root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;

- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);

if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
@@ -1669,16 +1675,16 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
* counters. So we need to make sure we have free blocks more
* than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
*/
- if (free_blocks - (nblocks + root_blocks) <
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
(4 * (FBC_BATCH * nr_cpu_ids))) {
- free_blocks =
- percpu_counter_sum(&sbi->s_freeblocks_counter);
+ free_blocks = percpu_counter_sum_positive(fbc);
+ dirty_blocks = percpu_counter_sum_positive(dbc);
}
#endif
- if (free_blocks <= root_blocks)
+ if (free_blocks <= (root_blocks + dirty_blocks))
/* we don't have free space */
return 0;
- if (free_blocks - root_blocks < nblocks)
+ if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
return free_blocks - root_blocks;
return nblocks;
}
@@ -1965,13 +1971,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
- /*
- * we allocated less blocks than we
- * claimed. Add the difference back.
- */
- percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
- }
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ /*
+ * Now reduce the dirty block count also. Should not go negative
+ */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226..0fa3762 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -59,6 +59,7 @@ struct ext4_sb_info {
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyblocks_counter;
struct blockgroup_lock s_blockgroup_lock;

/* root of the per fs reservation window tree */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 72a4a71..3f3ecc0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1030,19 +1030,25 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+ if (mdb_free) {
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;

- /* update fs free blocks counter for truncate case */
- percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+ /* update fs dirty blocks counter */
+ /*
+ * FIXME!! doing this get the free block count wrong
+ * But we need to take care of over allocated meta-data
+ * blocks
+ */
+ //percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ }

/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= used;

- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
- EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

@@ -1588,8 +1594,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)

release = to_free + mdb_free;

- /* update fs free blocks counter for truncate case */
- percpu_counter_add(&sbi->s_freeblocks_counter, release);
+ /* update fs dirty blocks counter for truncate case */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);

/* update per-inode reservations */
BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -2490,7 +2496,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
-
retry:
/*
* With delayed allocation, we don't log the i_disksize update
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 419009f..4da4b9a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+ percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
/*
- * free blocks account has already be reduced/reserved
- * at write_begin() time for delayed allocation
- * do not double accounting
+ * Now reduce the dirty block count also. Should not go negative
*/
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
- ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
- /*
- * we allocated less blocks than we calimed
- * Add the difference back
- */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
- }
-
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ed77786..7b9db51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ }
if (err) {
printk(KERN_ERR "EXT4-fs: insufficient memory\n");
goto failed_mount3;
@@ -2491,6 +2495,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3164,7 +3169,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
--
1.6.0.1.90.g27a6e
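
The accounting in this patch can be summarized with a toy single-threaded
model (plain longs instead of percpu counters, root-reserved blocks
ignored): a delalloc reservation only bumps the dirty counter, the later
real allocation debits both the free and the dirty counter, and the free
space reported by statfs is the free counter minus the dirty counter, so
the reported value does not change between reservation and allocation:

#include <assert.h>
#include <stdio.h>

static long free_blocks = 1000;	/* s_freeblocks_counter analogue  */
static long dirty_blocks;	/* s_dirtyblocks_counter analogue */

/* ext4_claim_free_blocks() analogue: reserve without allocating */
static int claim(long n)
{
	if (free_blocks < dirty_blocks + n)
		return -1;			/* -ENOSPC */
	dirty_blocks += n;
	return 0;
}

/* the later real allocation (writepages / mballoc) */
static void allocate(long n)
{
	free_blocks -= n;
	dirty_blocks -= n;
}

int main(void)
{
	assert(claim(200) == 0);		/* write_begin time */
	printf("statfs: %ld\n", free_blocks - dirty_blocks);	/* 800 */
	allocate(200);				/* writepages time  */
	printf("statfs: %ld\n", free_blocks - dirty_blocks);	/* 800 */
	return 0;
}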


2008-08-25 11:27:30

by Peter Zijlstra

Subject: Re: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> fbc->count is of type s64. The change was introduced by
> 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> from long to s64. Moving to s64 also means on 32 bit architectures
> we can get wrong values on fbc->count. Since fbc->count is read
> more frequently and updated rarely use seqlocks. This should
> reduce the impact of locking in the read path for 32bit arch.
>
> percpu_counter_read is used within interrupt context also. So
> use the irq safe version of seqlock while reading
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> CC: Peter Zijlstra <[email protected]>
> CC: Andrew Morton <[email protected]>
> ---
> include/linux/percpu_counter.h | 29 +++++++++++++++++++++++++----
> lib/percpu_counter.c | 20 ++++++++++----------
> 2 files changed, 35 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 9007ccd..36f3d2d 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -6,7 +6,7 @@
> * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> */
>
> -#include <linux/spinlock.h>
> +#include <linux/seqlock.h>
> #include <linux/smp.h>
> #include <linux/list.h>
> #include <linux/threads.h>
> @@ -16,7 +16,7 @@
> #ifdef CONFIG_SMP
>
> struct percpu_counter {
> - spinlock_t lock;
> + seqlock_t lock;
> s64 count;
> #ifdef CONFIG_HOTPLUG_CPU
> struct list_head list; /* All percpu_counters are on a list */
> @@ -53,10 +53,31 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +#if BITS_PER_LONG == 64
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> return fbc->count;
> }
> +#else
> +/* doesn't have atomic 64 bit operation */
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> +{
> + s64 ret;
> + unsigned seq;
> + unsigned long flags;
> + do {
> + seq = read_seqbegin_irqsave(&fbc->lock, flags);
> + ret = fbc->count;
> + } while(read_seqretry_irqrestore(&fbc->lock, seq, flags));

Do we really need to disable IRQs here? It seems to me the worst that
can happen is that the IRQ will change ->count and increase the sequence
number a bit - a case that is perfectly handled by the current retry
logic.

And not doing the IRQ flags bit saves a lot of time on some archs.

> + return ret;
> +
> +}
> +#endif
> +
> +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +{
> + return fbc_count(fbc);
> +}
>
> /*
> * It is possible for the percpu_counter_read() to return a small negative
> @@ -65,7 +86,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> */
> static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
> {
> - s64 ret = fbc->count;
> + s64 ret = fbc_count(fbc);
>
> barrier(); /* Prevent reloads of fbc->count */
> if (ret >= 0)
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index a866389..83bb809 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -18,13 +18,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> {
> int cpu;
>
> - spin_lock(&fbc->lock);
> + write_seqlock(&fbc->lock);
> for_each_possible_cpu(cpu) {
> s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> *pcount = 0;
> }
> fbc->count = amount;
> - spin_unlock(&fbc->lock);
> + write_sequnlock(&fbc->lock);
> }
> EXPORT_SYMBOL(percpu_counter_set);
>
> @@ -37,10 +37,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
> pcount = per_cpu_ptr(fbc->counters, cpu);
> count = *pcount + amount;
> if (count >= batch || count <= -batch) {
> - spin_lock(&fbc->lock);
> + write_seqlock(&fbc->lock);
> fbc->count += count;
> *pcount = 0;
> - spin_unlock(&fbc->lock);
> + write_sequnlock(&fbc->lock);
> } else {
> *pcount = count;
> }
> @@ -57,7 +57,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
> s64 ret;
> int cpu;
>
> - spin_lock(&fbc->lock);
> + write_seqlock(&fbc->lock);
> ret = fbc->count;
> for_each_online_cpu(cpu) {
> s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> @@ -66,7 +66,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
> }
> fbc->count = ret;
>
> - spin_unlock(&fbc->lock);
> + write_sequnlock(&fbc->lock);
> return ret;
> }
> EXPORT_SYMBOL(__percpu_counter_sum);
> @@ -75,7 +75,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);
>
> int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> {
> - spin_lock_init(&fbc->lock);
> + seqlock_init(&fbc->lock);
> fbc->count = amount;
> fbc->counters = alloc_percpu(s32);
> if (!fbc->counters)
> @@ -95,7 +95,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
>
> err = percpu_counter_init(fbc, amount);
> if (!err)
> - lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
> + lockdep_set_class(&fbc->lock.lock, &percpu_counter_irqsafe);
> return err;
> }
>
> @@ -130,11 +130,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
> s32 *pcount;
> unsigned long flags;
>
> - spin_lock_irqsave(&fbc->lock, flags);
> + write_seqlock_irqsave(&fbc->lock, flags);
> pcount = per_cpu_ptr(fbc->counters, cpu);
> fbc->count += *pcount;
> *pcount = 0;
> - spin_unlock_irqrestore(&fbc->lock, flags);
> + write_sequnlock_irqrestore(&fbc->lock, flags);
> }
> mutex_unlock(&percpu_counters_lock);
> return NOTIFY_OK;


2008-08-25 14:05:45

by Aneesh Kumar K.V

Subject: Re: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Mon, Aug 25, 2008 at 01:27:19PM +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> > from long to s64. Moving to s64 also means on 32 bit architectures
> > we can get wrong values on fbc->count. Since fbc->count is read
> > more frequently and updated rarely use seqlocks. This should
> > reduce the impact of locking in the read path for 32bit arch.
> >
> > percpu_counter_read is used within interrupt context also. So
> > use the irq safe version of seqlock while reading
> >
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > CC: Peter Zijlstra <[email protected]>
> > CC: Andrew Morton <[email protected]>
> > ---
> > include/linux/percpu_counter.h | 29 +++++++++++++++++++++++++----
> > lib/percpu_counter.c | 20 ++++++++++----------
> > 2 files changed, 35 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> > index 9007ccd..36f3d2d 100644
> > --- a/include/linux/percpu_counter.h
> > +++ b/include/linux/percpu_counter.h
> > @@ -6,7 +6,7 @@
> > * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> > */
> >
> > -#include <linux/spinlock.h>
> > +#include <linux/seqlock.h>
> > #include <linux/smp.h>
> > #include <linux/list.h>
> > #include <linux/threads.h>
> > @@ -16,7 +16,7 @@
> > #ifdef CONFIG_SMP
> >
> > struct percpu_counter {
> > - spinlock_t lock;
> > + seqlock_t lock;
> > s64 count;
> > #ifdef CONFIG_HOTPLUG_CPU
> > struct list_head list; /* All percpu_counters are on a list */
> > @@ -53,10 +53,31 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > return __percpu_counter_sum(fbc);
> > }
> >
> > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +#if BITS_PER_LONG == 64
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > {
> > return fbc->count;
> > }
> > +#else
> > +/* doesn't have atomic 64 bit operation */
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > +{
> > + s64 ret;
> > + unsigned seq;
> > + unsigned long flags;
> > + do {
> > + seq = read_seqbegin_irqsave(&fbc->lock, flags);
> > + ret = fbc->count;
> > + } while(read_seqretry_irqrestore(&fbc->lock, seq, flags));
>
> Do we really need to disabled IRQs here? It seems to me the worst that
> can happen is that the IRQ will change ->count and increase the sequence
> number a bit - a case that is perfectly handled by the current retry
> logic.
>
> And not doing the IRQ flags bit saves a lot of time on some archs.
>

Will update in the next version. BTW, does it make sense to do
the above unconditionally now, i.e. to remove the #if? How much
impact would doing read_seqbegin and read_seqretry on a 64-bit
machine have?

-aneesh

2008-08-25 14:21:44

by Peter Zijlstra

Subject: Re: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Mon, 2008-08-25 at 19:35 +0530, Aneesh Kumar K.V wrote:
> On Mon, Aug 25, 2008 at 01:27:19PM +0200, Peter Zijlstra wrote:
> > On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:

> > > @@ -53,10 +53,31 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > > return __percpu_counter_sum(fbc);
> > > }
> > >
> > > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > +#if BITS_PER_LONG == 64
> > > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > > {
> > > return fbc->count;
> > > }
> > > +#else
> > > +/* doesn't have atomic 64 bit operation */
> > > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > > +{
> > > + s64 ret;
> > > + unsigned seq;
> > > + unsigned long flags;
> > > + do {
> > > + seq = read_seqbegin_irqsave(&fbc->lock, flags);
> > > + ret = fbc->count;
> > > + } while(read_seqretry_irqrestore(&fbc->lock, seq, flags));
> >
> > Do we really need to disabled IRQs here? It seems to me the worst that
> > can happen is that the IRQ will change ->count and increase the sequence
> > number a bit - a case that is perfectly handled by the current retry
> > logic.
> >
> > And not doing the IRQ flags bit saves a lot of time on some archs.
> >
>
> Will update in the next version. BTW does it make sense to do
> the above unconditionally now ? ie to remove the #if ?. How much
> impact would it be to do read_seqbegin and read_seqretry on a 64bit
> machine too ?

There are a few smp_rmb()s in there - so that will at the very least be a
compiler barrier and thus generate slightly worse code, along with a
few extra reads.

But I'm not sure that's measurable..
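
For reference, dropping the IRQ-disabling reader as suggested, and using
the seqlock read loop unconditionally (that is, also on 64-bit, without
the BITS_PER_LONG check), would make fbc_count() look roughly like the
sketch below; this is only a sketch of the idea under discussion, not a
tested patch:

static inline s64 fbc_count(struct percpu_counter *fbc)
{
	s64 ret;
	unsigned seq;

	do {
		seq = read_seqbegin(&fbc->lock);
		ret = fbc->count;
	} while (read_seqretry(&fbc->lock, seq));

	return ret;
}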


2008-08-25 21:00:51

by Mingming Cao

Subject: Re: [RFC PATCH -v2] ext4: Make sure all the block allocation paths reserve blocks


On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> With delayed allocation we need to make sure block are reserved
> before we attempt to allocate them. Otherwise we get block
> allocation failure (ENOSPC) during writepages which cannot
> be handled. This would mean silent data loss (We do a printk
> stating data will be lost). This patch update the DIO
> and fallocate code path to do block reservation before block
> allocation.
>

I like the DIO race with buffered IO case you described on IRC; it
explains well why we need the block reservation even for DIO and
fallocate. Could you add that to the description here?

> When free blocks count go below a threshold we switch to
> a slow patch which looks at other CPU's accumulated percpu
> counter values.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 72 ++++++++++++++++++++++++++++++++++++++++++++---------
> fs/ext4/ext4.h | 2 +
> fs/ext4/inode.c | 5 +---
> fs/ext4/mballoc.c | 23 +++++++++-------
> 4 files changed, 76 insertions(+), 26 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index cfed283..4a53541 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1602,6 +1602,47 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> return ret;
> }
>
> +int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks)
> +{
> + s64 free_blocks;
> + ext4_fsblk_t root_blocks = 0;
> + struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> +
> + free_blocks = percpu_counter_read(fbc);
> +
> + if (!capable(CAP_SYS_RESOURCE) &&
> + sbi->s_resuid != current->fsuid &&
> + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> + root_blocks = ext4_r_blocks_count(sbi->s_es);
> +#ifdef CONFIG_SMP
> + /* Each CPU can accumulate FBC_BATCH blocks in their local
> + * counters. So we need to make sure we have free blocks more
> + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> + */
> + if (free_blocks - (nblocks + root_blocks) <
> + (4 * (FBC_BATCH * nr_cpu_ids))) {

It would be nice to define an Ext4_FREEBLOCK_WATERMARK macro to hide the
details of FBC_BATCH and get rid of the #ifdef CONFIG_SMP here.
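
A rough sketch of what such a macro could look like, reusing the
4 * FBC_BATCH * nr_cpu_ids window from the patch (the macro name follows
Mingming's suggestion and is hypothetical; defining it as 0 on !SMP is
what lets callers drop the #ifdef):

#ifdef CONFIG_SMP
#define EXT4_FREEBLOCK_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
#else
#define EXT4_FREEBLOCK_WATERMARK 0
#endif

	/* caller side, with no #ifdef left */
	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCK_WATERMARK)
		free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);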

> + /*
> + * We need to sum and claim under lock
> + * This is the slow patch which will be
> + * taken when we are very low on free blocks
> + */
> + if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
> + return -ENOSPC;
> + /* add root_blocks back */
> + percpu_counter_add(fbc, root_blocks);
> + return 0;
> + }
> +#endif
> + if (free_blocks < (root_blocks + nblocks))
> + /* we don't have free space */
> + return -ENOSPC;
> +
> + /* reduce fs free blocks counter */
> + percpu_counter_sub(fbc, nblocks);
> + return 0;
> +}
> +
> *
> * ext4_has_free_blocks()
> * @sbi: in-core super block structure.
> @@ -1624,9 +1665,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> root_blocks = ext4_r_blocks_count(sbi->s_es);
> #ifdef CONFIG_SMP
> - if (free_blocks - root_blocks < FBC_BATCH)
> + /* Each CPU can accumulate FBC_BATCH blocks in their local
> + * counters. So we need to make sure we have free blocks more
> + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> + */
> + if (free_blocks - (nblocks + root_blocks) <
> + (4 * (FBC_BATCH * nr_cpu_ids))) {
> free_blocks =
> percpu_counter_sum(&sbi->s_freeblocks_counter);
> + }
> #endif


Same as above.

> /*
> if (free_blocks <= root_blocks)
> /* we don't have free space */
> @@ -1634,7 +1681,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> if (free_blocks - root_blocks < nblocks)
> return free_blocks - root_blocks;
> return nblocks;
> - }
> +}
>
>
> /**
> @@ -1713,14 +1760,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> /*
> * With delalloc we already reserved the blocks
> */
> - *count = ext4_has_free_blocks(sbi, *count);
> - }
> - if (*count == 0) {
> - *errp = -ENOSPC;
> - return 0; /*return with ENOSPC error */
> + if (ext4_claim_free_blocks(sbi, *count)) {
> + *errp = -ENOSPC;
> + return 0; /*return with ENOSPC error */
> + }
> }
> - num = *count;
> -
> /*
> * Check quota for allocation of this block.
> */
> @@ -1915,9 +1959,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> le16_add_cpu(&gdp->bg_free_blocks_count, -num);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
> spin_unlock(sb_bgl_lock(sbi, group_no));
> - if (!EXT4_I(inode)->i_delalloc_reserved_flag)
> - percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> -
> + if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> + /*
> + * we allocated less blocks than we
> + * claimed. Add the difference back.
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> + }
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
> spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7f11b25..3738039 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
> unsigned long *count, int *errp);
> extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> ext4_fsblk_t goal, unsigned long *count, int *errp);
> +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks);
> extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks);
> extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1c289c1..d965a05 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
> total = md_needed + nrblocks;
>
> - if (ext4_has_free_blocks(sbi, total) < total) {
> + if (ext4_claim_free_blocks(sbi, total)) {
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> return -ENOSPC;
> }
> - /* reduce fs free blocks counter */
> - percpu_counter_sub(&sbi->s_freeblocks_counter, total);
> -
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 82dd0e4..4404b46 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> * at write_begin() time for delayed allocation
> * do not double accounting
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
> - percpu_counter_sub(&sbi->s_freeblocks_counter,
> - ac->ac_b_ex.fe_len);
> + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> + ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> + /*
> + * we allocated less blocks than we calimed
> + * Add the difference back
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter,
> + ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> + }
>
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> @@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> /*
> * With delalloc we already reserved the blocks
> */
> - ar->len = ext4_has_free_blocks(sbi, ar->len);
> - }
> -
> - if (ar->len == 0) {
> - *errp = -ENOSPC;
> - return 0;
> + if (ext4_claim_free_blocks(sbi, ar->len)) {
> + *errp = -ENOSPC;
> + return 0;
> + }
> }
> -
> while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
> ar->flags |= EXT4_MB_HINT_NOPREALLOC;
> ar->len--;

2008-08-25 21:06:38

by Mingming Cao

Subject: Re: [RFC PATCH -v2] ext4: Retry block reservation


On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> During block reservation if we don't have enough
> blocks left, retry block reservation with smaller
> block count. This make sure we try fallocate
> and DIO with smaller request size and don't fail early.
> The delayed allocation reservation cannot try with smaller
> block count. So retry block reservation to handle temporary
> disk full conditions. Also print free blocks details if we
> fail block allocation during writepages.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 8 +++++++-
> fs/ext4/inode.c | 33 ++++++++++++++++++++++++++++++---
> fs/ext4/mballoc.c | 7 ++++++-
> 3 files changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 4a53541..b7d1347 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1760,10 +1760,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> /*
> * With delalloc we already reserved the blocks
> */
> - if (ext4_claim_free_blocks(sbi, *count)) {
> + while (*count && ext4_claim_free_blocks(sbi, *count)) {
> + /* let others to free the space */
> + yield();
> + *count = *count >> 1;
> + }
> + if (!*count) {

Could ext4_claim_free_blocks() return the total block shortage, so
that we don't need to "guess" how much to attempt to
allocate?


> *errp = -ENOSPC;
> return 0; /*return with ENOSPC error */
> }
> + num = *count;
> }
> /*
> * Check quota for allocation of this block.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index d965a05..72a4a71 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1521,6 +1521,7 @@ static int ext4_journalled_write_end(struct file *file,
>
> static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> {
> + int retries = 0;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> unsigned long md_needed, mdblocks, total = 0;
>
> @@ -1529,6 +1530,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> * in order to allocate nrblocks
> * worse case is one extent per block
> */
> +repeat:
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
> mdblocks = ext4_calc_metadata_amount(inode, total);
> @@ -1539,6 +1541,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
>
> if (ext4_claim_free_blocks(sbi, total)) {
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> + if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
> + yield();
> + goto repeat;
> + }
> return -ENOSPC;
> }
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> @@ -1813,6 +1819,26 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
> return;
> }
>
> +static void ext4_print_free_blocks(struct super_block *sb)
> +{
> +#ifdef EXT4FS_DEBUG
> + ext4_group_t i;
> + struct ext4_group_desc *gdp;
> + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> + printk(KERN_DEBUG "Free blocks details\n");
> + for (i = 0; i < ngroups; i++) {
> + gdp = ext4_get_group_desc(sb, i, NULL);
> + if (!gdp)
> + continue;
> + printk(KERN_DEBUG "Free blocks in group %lu is %d\n",
> + i, le16_to_cpu(gdp->bg_free_blocks_count));
> + }
> +#endif
> + printk(KERN_CRIT "Total free blocks count %lld\n",
> + ext4_count_free_blocks(sb));
> + return;
> +}
> +

ext4_count_free_blocks() has the extra debugging code too. I think you
could call ext4_count_free_blocks() directly and don't need the new
function ext4_print_free_blocks().
> /*
> * mpage_da_map_blocks - go through given space
> *
> @@ -1825,20 +1851,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
> static int mpage_da_map_blocks(struct mpage_da_data *mpd)
> {
> int err = 0;
> + struct buffer_head new;
> struct buffer_head *lbh = &mpd->lbh;
> sector_t next = lbh->b_blocknr;
> - struct buffer_head new;
>
> /*
> * We consider only non-mapped and non-allocated blocks
> */
> if (buffer_mapped(lbh) && !buffer_delay(lbh))
> return 0;
> -
> new.b_state = lbh->b_state;
> new.b_blocknr = 0;
> new.b_size = lbh->b_size;
> -
> /*
> * If we didn't accumulate anything
> * to write simply return
> @@ -1871,6 +1895,9 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
> lbh->b_size >> mpd->inode->i_blkbits, err);
> printk(KERN_EMERG "This should not happen.!! "
> "Data will be lost\n");

Could we print out the error code with the warning message here?

> + if (err == -ENOSPC) {
> + ext4_print_free_blocks(mpd->inode->i_sb);
> + }
> /* invlaidate all the pages */
> ext4_da_block_invalidatepages(mpd, next,
> lbh->b_size >> mpd->inode->i_blkbits);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 4404b46..419009f 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4397,7 +4397,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> /*
> * With delalloc we already reserved the blocks
> */
> - if (ext4_claim_free_blocks(sbi, ar->len)) {
> + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
> + /* let others to free the space */
> + yield();
> + ar->len = ar->len >> 1;
> + }
> + if (!ar->len) {
> *errp = -ENOSPC;
> return 0;
> }

2008-08-25 21:26:11

by Mingming Cao

Subject: Re: [RFC PATCH -v2] ext4: Add percpu dirty block accounting.


On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> This patch add dirty block accounting using percpu_counters.
> Delayed allocation block reservation is now done by updating
> dirty block counter. In the later patch we switch to non
> delalloc mode if the filesystem free blocks is < that
> 150 % of total filesystem dirty blocks
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 64 ++++++++++++++++++++++++++++------------------------
> fs/ext4/ext4_sb.h | 1 +
> fs/ext4/inode.c | 25 ++++++++++++--------
> fs/ext4/mballoc.c | 17 ++-----------
> fs/ext4/super.c | 8 +++++-
> 5 files changed, 60 insertions(+), 55 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index b7d1347..4ebe3b6 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1605,11 +1605,13 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks)
> {
> - s64 free_blocks;
> + s64 free_blocks, dirty_blocks;
> ext4_fsblk_t root_blocks = 0;
> struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> - free_blocks = percpu_counter_read(fbc);
> + free_blocks = percpu_counter_read_positive(fbc);
> + dirty_blocks = percpu_counter_read_positive(dbc);
>
> if (!capable(CAP_SYS_RESOURCE) &&
> sbi->s_resuid != current->fsuid &&
> @@ -1620,26 +1622,27 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> * counters. So we need to make sure we have free blocks more
> * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> */
> - if (free_blocks - (nblocks + root_blocks) <
> + if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
> (4 * (FBC_BATCH * nr_cpu_ids))) {
> - /*
> - * We need to sum and claim under lock
> - * This is the slow patch which will be
> - * taken when we are very low on free blocks
> - */
> - if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
> - return -ENOSPC;
> - /* add root_blocks back */
> - percpu_counter_add(fbc, root_blocks);
> - return 0;
> +
> + free_blocks = percpu_counter_sum(fbc);
> + dirty_blocks = percpu_counter_sum(dbc);
> + if (dirty_blocks < 0) {
> + printk(KERN_CRIT "Dirty block accounting "
> + "went wrong %lld\n",
> + dirty_blocks);
> + }
> }
> #endif
> - if (free_blocks < (root_blocks + nblocks))
> + /* Check whether we have space after
> + * accounting for current dirty blocks
> + */
> + if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
> /* we don't have free space */
> return -ENOSPC;
>
> - /* reduce fs free blocks counter */
> - percpu_counter_sub(fbc, nblocks);
> + /* Add the blocks to nblocks */
> + percpu_counter_add(dbc, nblocks);
> return 0;
> }
>

I noticed that you dropped the code that updated the counter together with
the accurate percpu_counter_sum(). This opens a window where two allocation
reservations can both get through when the fs is almost full/fully
booked... Any reason to drop that?
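
For reference, the dropped slow path did the sum and the claim in one
critical section, roughly along these lines (a sketch only, written against
the seqlock variant of percpu_counter from earlier in this series; the real
percpu_counter_sum_and_sub() may have differed in detail):

        /* sum accurately and claim 'amount' atomically; non-zero means no space */
        int percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)
        {
                s64 count;
                int cpu;

                write_seqlock(&fbc->lock);
                count = fbc->count;
                for_each_online_cpu(cpu) {
                        s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                        count += *pcount;
                }
                if (count < amount) {
                        write_sequnlock(&fbc->lock);
                        return -ENOSPC;
                }
                /* fold the per-cpu deltas in and take the claim in one go */
                for_each_online_cpu(cpu) {
                        s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                        *pcount = 0;
                }
                fbc->count = count - amount;
                write_sequnlock(&fbc->lock);
                return 0;
        }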

> @@ -1655,10 +1658,13 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks)
> {
> - ext4_fsblk_t free_blocks;
> + ext4_fsblk_t free_blocks, dirty_blocks;
> ext4_fsblk_t root_blocks = 0;
> + struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
>
> - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
> + free_blocks = percpu_counter_read_positive(fbc);
> + dirty_blocks = percpu_counter_read_positive(dbc);
>
> if (!capable(CAP_SYS_RESOURCE) &&
> sbi->s_resuid != current->fsuid &&
> @@ -1669,16 +1675,16 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> * counters. So we need to make sure we have free blocks more
> * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> */
> - if (free_blocks - (nblocks + root_blocks) <
> + if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
> (4 * (FBC_BATCH * nr_cpu_ids))) {
> - free_blocks =
> - percpu_counter_sum(&sbi->s_freeblocks_counter);
> + free_blocks = percpu_counter_sum_positive(fbc);
> + dirty_blocks = percpu_counter_sum_positive(dbc);
> }
> #endif
> - if (free_blocks <= root_blocks)
> + if (free_blocks <= (root_blocks + dirty_blocks))
> /* we don't have free space */
> return 0;
> - if (free_blocks - root_blocks < nblocks)
> + if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
> return free_blocks - root_blocks;
> return nblocks;
> }
> @@ -1965,13 +1971,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> le16_add_cpu(&gdp->bg_free_blocks_count, -num);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
> spin_unlock(sb_bgl_lock(sbi, group_no));
> - if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> - /*
> - * we allocated less blocks than we
> - * claimed. Add the difference back.
> - */
> - percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> - }
> + percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> + /*
> + * Now reduce the dirty block count also. Should not go negative
> + */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);

Ah... I think this is a bug which causes the ENOSPC still. You are
updating s_dirtyblocks_counter here, taking away the block reservation,
regardless of whether this block allocation went through the delalloc
mode or the nondelalloc mode.

For example, when the fs is relatively fully booked, a file1 comes in and
switches to the nondelalloc mode because the fs is relatively fully
booked. Then, after file1 has done its block allocation, it reduces
s_dirtyblocks_counter; this is wrong, and will cause a later ENOSPC.
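
One way to close that would be to make the dirty-counter release conditional
on the allocation actually carrying a delalloc reservation, e.g. (a sketch
only, assuming i_delalloc_reserved_flag is set for the delalloc path as in
the hunk being removed above):

        percpu_counter_sub(&sbi->s_freeblocks_counter, num);
        /* only delalloc allocations have a dirty-block reservation to release */
        if (EXT4_I(inode)->i_delalloc_reserved_flag)
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);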

> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
> spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
> index 6300226..0fa3762 100644
> --- a/fs/ext4/ext4_sb.h
> +++ b/fs/ext4/ext4_sb.h
> @@ -59,6 +59,7 @@ struct ext4_sb_info {
> struct percpu_counter s_freeblocks_counter;
> struct percpu_counter s_freeinodes_counter;
> struct percpu_counter s_dirs_counter;
> + struct percpu_counter s_dirtyblocks_counter;
> struct blockgroup_lock s_blockgroup_lock;
>
> /* root of the per fs reservation window tree */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 72a4a71..3f3ecc0 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1030,19 +1030,25 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
> BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
>
> - /* Account for allocated meta_blocks */
> - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> + if (mdb_free) {
> + /* Account for allocated meta_blocks */
> + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
>
> - /* update fs free blocks counter for truncate case */
> - percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
> + /* update fs dirty blocks counter */
> + /*
> + * FIXME!! doing this get the free block count wrong
> + * But we need to take care of over allocated meta-data
> + * blocks
> + */
> + //percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);

I think we should update the overall reserved metadata blocks. Turning it
off only hides the real problem.
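
For illustration, something along these lines (a sketch only, re-enabling
the commented-out update inside the mdb_free check):

        if (mdb_free) {
                /* Account for allocated meta_blocks */
                mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
                /* release the over-reserved metadata blocks from the dirty count */
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
                EXT4_I(inode)->i_allocated_meta_blocks = 0;
                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        }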

> + EXT4_I(inode)->i_allocated_meta_blocks = 0;
> + EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> + }
>
> /* update per-inode reservations */
> BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
> EXT4_I(inode)->i_reserved_data_blocks -= used;
>
> - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> - EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> - EXT4_I(inode)->i_allocated_meta_blocks = 0;
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> }
>
> @@ -1588,8 +1594,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
>
> release = to_free + mdb_free;
>
> - /* update fs free blocks counter for truncate case */
> - percpu_counter_add(&sbi->s_freeblocks_counter, release);
> + /* update fs dirty blocks counter for truncate case */
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
>
> /* update per-inode reservations */
> BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
> @@ -2490,7 +2496,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
> -
> retry:
> /*
> * With delayed allocation, we don't log the i_disksize update
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 419009f..4da4b9a 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2971,22 +2971,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
> spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> -
> + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
> /*
> - * free blocks account has already be reduced/reserved
> - * at write_begin() time for delayed allocation
> - * do not double accounting
> + * Now reduce the dirty block count also. Should not go negative
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> - ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> - /*
> - * we allocated less blocks than we calimed
> - * Add the difference back
> - */
> - percpu_counter_add(&sbi->s_freeblocks_counter,
> - ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> - }
> -
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);

Same bug as before. We can't update s_dirtyblocks_counter unconditionally.
At least check whether this allocation request is coming from delalloc.
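
For example (a sketch only, keying off the same flag the removed hunk used):

        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
        /* only delalloc allocations carry a dirty-block reservation to release */
        if (ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);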

> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> ac->ac_b_ex.fe_group);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index ed77786..7b9db51 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -520,6 +520,7 @@ static void ext4_put_super(struct super_block *sb)
> percpu_counter_destroy(&sbi->s_freeblocks_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> + percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
> brelse(sbi->s_sbh);
> #ifdef CONFIG_QUOTA
> for (i = 0; i < MAXQUOTAS; i++)
> @@ -2259,6 +2260,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> err = percpu_counter_init(&sbi->s_dirs_counter,
> ext4_count_dirs(sb));
> }
> + if (!err) {
> + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
> + }
> if (err) {
> printk(KERN_ERR "EXT4-fs: insufficient memory\n");
> goto failed_mount3;
> @@ -2491,6 +2495,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> percpu_counter_destroy(&sbi->s_freeblocks_counter);
> percpu_counter_destroy(&sbi->s_freeinodes_counter);
> percpu_counter_destroy(&sbi->s_dirs_counter);
> + percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
> failed_mount2:
> for (i = 0; i < db_count; i++)
> brelse(sbi->s_group_desc[i]);
> @@ -3164,7 +3169,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
> buf->f_type = EXT4_SUPER_MAGIC;
> buf->f_bsize = sb->s_blocksize;
> buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
> - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
> + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
> + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);

In case the counter turns out negative, I think it is better to add a slow
path here that uses the accurate version of freeblocks - dirtyblocks.
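
For example (a sketch only, clamping to zero if dirty momentarily exceeds free):

        s64 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
                    percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
        /* don't report a negative free count to userspace */
        buf->f_bfree = bfree < 0 ? 0 : bfree;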


> ext4_free_blocks_count_set(es, buf->f_bfree);
> buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
> if (buf->f_bfree < ext4_r_blocks_count(es))

2008-08-25 21:31:39

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC PATCH -v2] ext4: Switch to non delalloc mode when we are low on free blocks count.


On Mon, 2008-08-25 at 16:50 +0530, Aneesh Kumar K.V wrote:
> delayed allocation allocate blocks during writepages. That also
> means we cannot handle block allocation failures. Switch to
> non - delalloc when we are running low on free blocks.
> Delayed allocation need to do aggressive meta-data block reservation
> considering that the requested blocks can all be discontiguous.
> Switching to non-delalloc avoids that. Also we can satisfy
> partial write in non-delalloc mode.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/inode.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3f3ecc0..d923a14 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2482,6 +2482,29 @@ static int ext4_da_writepages(struct address_space *mapping,
> return ret;
> }
>
> +#define FALL_BACK_TO_NONDELALLOC 1
> +static int ext4_nonda_switch(struct super_block *sb)
> +{
> + s64 free_blocks, dirty_blocks;
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> + /*
> + * switch to non delalloc mode if we are running low
> + * on free block. The free block accounting via percpu
> + * counters can get slightly wrong with FBC_BATCH getting
> + * accumulated on each CPU without updating global counters
> + * Delalloc need an accurate free block accounting. So switch
> + * to non delalloc when we are near to error range.
> + */
> + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
> + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
> + if ( 2 * free_blocks < 3 * dirty_blocks) {
> + /* free block count is less that 150% of dirty blocks */
> + return 1;
> + }

In the case where free_blocks is below the EXT4_FREEBLOCKS_WATERMARK, we
should fall back to nondelalloc mode even if there are no dirty_blocks.
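
Something like this, perhaps (a sketch; EXT4_FREEBLOCKS_WATERMARK stands in
for whatever absolute low-free-space threshold the series settles on):

        if (free_blocks < EXT4_FREEBLOCKS_WATERMARK ||
            2 * free_blocks < 3 * dirty_blocks) {
                /* free blocks low in absolute terms, or below 150% of dirty blocks */
                return 1;
        }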

> + return 0;
> +}
> +
> static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, void **fsdata)
> @@ -2496,6 +2519,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
> index = pos >> PAGE_CACHE_SHIFT;
> from = pos & (PAGE_CACHE_SIZE - 1);
> to = from + len;
> +
> + if (ext4_nonda_switch(inode->i_sb)) {
> + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
> + return ext4_write_begin(file, mapping, pos,
> + len, flags, pagep, fsdata);
> + }
> + *fsdata = (void *)0;

We should probably add a warning if *fsdata is non-zero, instead of
forcing it to 0 unconditionally.
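
For example (a sketch of that suggestion; it assumes callers hand in a
zeroed *fsdata, which may not hold for every write_begin caller):

        WARN_ON(*fsdata != NULL);
        *fsdata = (void *)0;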

> retry:
> /*
> * With delayed allocation, we don't log the i_disksize update
> @@ -2564,6 +2594,19 @@ static int ext4_da_write_end(struct file *file,
> handle_t *handle = ext4_journal_current_handle();
> loff_t new_i_size;
> unsigned long start, end;
> + int write_mode = (int)fsdata;
> +
> + if (write_mode == FALL_BACK_TO_NONDELALLOC) {
> + if (ext4_should_order_data(inode)) {
> + return ext4_ordered_write_end(file, mapping, pos,
> + len, copied, page, fsdata);
> + } else if (ext4_should_writeback_data(inode)) {
> + return ext4_writeback_write_end(file, mapping, pos,
> + len, copied, page, fsdata);
> + } else {
> + BUG();

Shouldn't we warn the user that we can't fall back to journalled mode and
let it continue in delalloc mode, instead of BUG()ing the system?
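
For example (a sketch only; it warns and then falls through to the regular
delalloc write_end path below instead of calling BUG()):

        } else {
                /* journalled data: can't fall back, warn and stay in delalloc mode */
                printk(KERN_WARNING "ext4: %s: journalled data not supported "
                       "with delalloc, continuing in delalloc mode\n", __func__);
        }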

> + }
> + }
>
> start = pos & (PAGE_CACHE_SIZE - 1);
> end = start + copied -1;
> @@ -4901,6 +4944,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> loff_t size;
> unsigned long len;
> int ret = -EINVAL;
> + void *fsdata;
> struct file *file = vma->vm_file;
> struct inode *inode = file->f_path.dentry->d_inode;
> struct address_space *mapping = inode->i_mapping;
> @@ -4939,11 +4983,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> * on the same page though
> */
> ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
> - len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
> + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
> if (ret < 0)
> goto out_unlock;
> ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
> - len, len, page, NULL);
> + len, len, page, fsdata);
> if (ret < 0)
> goto out_unlock;
> ret = 0;

2008-08-25 23:18:15

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Aug 25, 2008 19:35 +0530, Aneesh Kumar wrote:
> BTW does it make sense to do
> the above unconditionally now ? ie to remove the #if ?. How much
> impact would it be to do read_seqbegin and read_seqretry on a 64bit
> machine too ?

If we don't need it, why do it? There is no benefit and just overhead.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2008-08-27 00:33:55

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC PATCH -v2] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Mon, 25 Aug 2008 16:50:28 +0530
"Aneesh Kumar K.V" <[email protected]> wrote:

> fbc->count is of type s64. The change was introduced by
> 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> from long to s64. Moving to s64 also means on 32 bit architectures
> we can get wrong values on fbc->count. Since fbc->count is read
> more frequently and updated rarely use seqlocks. This should
> reduce the impact of locking in the read path for 32bit arch.
>
> percpu_counter_read is used within interrupt context also. So
> use the irq safe version of seqlock while reading
>

The linux-ext4 list is not an appropriate place for discussing a
kernel-wide change.

> include/linux/percpu_counter.h | 29 +++++++++++++++++++++++++----
> lib/percpu_counter.c | 20 ++++++++++----------
> 2 files changed, 35 insertions(+), 14 deletions(-)

Which this one surely is. I added linux-kernel to cc.

> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 9007ccd..36f3d2d 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -6,7 +6,7 @@
> * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> */
>
> -#include <linux/spinlock.h>
> +#include <linux/seqlock.h>
> #include <linux/smp.h>
> #include <linux/list.h>
> #include <linux/threads.h>
> @@ -16,7 +16,7 @@
> #ifdef CONFIG_SMP
>
> struct percpu_counter {
> - spinlock_t lock;
> + seqlock_t lock;
> s64 count;
> #ifdef CONFIG_HOTPLUG_CPU
> struct list_head list; /* All percpu_counters are on a list */
> @@ -53,10 +53,31 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +#if BITS_PER_LONG == 64
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> return fbc->count;
> }
> +#else
> +/* doesn't have atomic 64 bit operation */
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> +{
> + s64 ret;
> + unsigned seq;
> + unsigned long flags;
> + do {
> + seq = read_seqbegin_irqsave(&fbc->lock, flags);
> + ret = fbc->count;
> + } while(read_seqretry_irqrestore(&fbc->lock, seq, flags));
> + return ret;
> +
> +}
> +#endif

The problem of atomically handling 64-bit quantities on 32-bit machines
is by no means unique to percpu_counters. We sorta-solved it for
i_size and we continue to sorta-not-solve it for loff_t and surely
there are other places which already sorta-solve it and which will be
sorta-solved in the future.

All of which tells us that we need a real solution, at a lower level.

We already have a suitable type, really: atomic64_t. But it's an
arch-private thing and is only implemented on 64-bit architectures.

Perhaps atomic64_t should be promoted to being a kernel-wide facility?
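
For illustration only (this is not an existing generic facility at this
point; the names just mirror the 64-bit arch API): a 32-bit fallback could
wrap the 64-bit value in a spinlock, e.g.

        typedef struct {
                long long counter;
                spinlock_t lock;
        } atomic64_t;

        static inline long long atomic64_read(atomic64_t *v)
        {
                unsigned long flags;
                long long ret;

                spin_lock_irqsave(&v->lock, flags);
                ret = v->counter;
                spin_unlock_irqrestore(&v->lock, flags);
                return ret;
        }

        static inline void atomic64_add(long long a, atomic64_t *v)
        {
                unsigned long flags;

                spin_lock_irqsave(&v->lock, flags);
                v->counter += a;
                spin_unlock_irqrestore(&v->lock, flags);
        }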


2008-08-27 08:31:40

by Akira Fujita

[permalink] [raw]
Subject: Re: [RFC PATCH -v2] ext4: request for blocks with ar.excepted_group = -1

Hi Aneesh,

Thank you for your patch.
I will merge this change into the next version of defrag patches.

Regards,
Akira Fujita

Aneesh Kumar K.V Wrote:
> Otherwise we skip group 0 during block allocation.
> This cause ENOSPC even if we have free blocks in
> group 0. This should be merged with defrag. The
> expected_group changes are introduced by defrag patches.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 1 +
> fs/ext4/extents.c | 1 +
> 2 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 4ebe3b6..1a068f5 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -2034,6 +2034,7 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
> ar.goal = goal;
> ar.len = *count;
> ar.logical = iblock;
> + ar.excepted_group = -1;
>
> if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
> /* enable in-core preallocation for data block allocation */
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index bf612a7..268e96d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2879,6 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
> ar.goal = ext4_ext_find_goal(inode, path, iblock);
> ar.logical = iblock;
> ar.len = allocated;
> + ar.excepted_group = -1;
> if (S_ISREG(inode->i_mode))
> ar.flags = EXT4_MB_HINT_DATA;
> else

--
Akira Fujita <[email protected]>

The First Fundamental Software Development Group,
Software Development Division,
NEC Software Tohoku, Ltd.