2008-08-22 13:34:44

by Aneesh Kumar K.V

Subject: [RFC PATCH] percpu_counters: make fbc->count read atomic on 32 bit architecture

fbc->count is of type s64. The change was introduced by
0216bfcffe424a5473daa4da47440881b36c1f4, which changed the type
from long to s64. Moving to s64 also means that on 32 bit architectures
we can read a torn, and therefore wrong, value of fbc->count.

percpu_counter_read() is also used from interrupt context, so
use the irq-safe version of the spinlock while reading.
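
A minimal illustration (editor's sketch, not part of the patch) of the
torn read being fixed here, assuming a little-endian 32 bit machine:

	/* fbc->count is an s64, i.e. two machine words on 32 bit, so a   */
	/* plain load is two 32 bit loads and an update can land between  */
	/* them:                                                           */
	u32 lo = ((u32 *)&fbc->count)[0]; /* reads the old low word        */
	/* ... an interrupt or another CPU does fbc->count += batch ...    */
	u32 hi = ((u32 *)&fbc->count)[1]; /* reads the new high word       */
	s64 seen = ((s64)hi << 32) | lo;  /* mixes old and new halves      */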

Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Peter Zijlstra <[email protected]>
---
include/linux/percpu_counter.h | 23 +++++++++++++++++++++--
1 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccd..af485b1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -53,10 +53,29 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return __percpu_counter_sum(fbc);
}

-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+#if BITS_PER_LONG == 64
+static inline s64 fbc_count(struct percpu_counter *fbc)
{
return fbc->count;
}
+#else
+/* doesn't have atomic 64 bit operation */
+static inline s64 fbc_count(struct percpu_counter *fbc)
+{
+ s64 ret;
+ unsigned long flags;
+ spin_lock_irqsave(&fbc->lock, flags);
+ ret = fbc->count;
+ spin_unlock_irqrestore(&fbc->lock, flags);
+ return ret;
+
+}
+#endif
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+ return fbc_count(fbc);
+}

/*
* It is possible for the percpu_counter_read() to return a small negative
@@ -65,7 +84,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = fbc_count(fbc);

barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
--
1.6.0.2.g2ebc0



2008-08-22 13:34:48

by Aneesh Kumar K.V

Subject: [RFC PATCH] ext4: Make sure all the block allocation paths reserve blocks

With delayed allocation we need to make sure blocks are reserved
before we attempt to allocate them. Otherwise we get a block
allocation failure (ENOSPC) during writepages, which cannot
be handled, and that would mean silent data loss (we do a printk
stating that data will be lost). This patch updates the DIO
and fallocate code paths to do block reservation before block
allocation.

When the free blocks count goes below a threshold we switch to
a slow path which also looks at the other CPUs' accumulated percpu
counter values.
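
A condensed sketch of the resulting calling pattern in the allocators
(illustration only; allocate_blocks() is a stand-in for the real
allocator, not an actual function):

	if (ext4_claim_free_blocks(sbi, *count)) {   /* reserve up front         */
		*errp = -ENOSPC;                     /* fail before doing work   */
		return 0;
	}
	/* allocate_blocks() is a stand-in; it may allocate fewer than *count */
	num = allocate_blocks(handle, inode, goal, *count);
	if (num < *count)                            /* hand back the over-claim */
		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);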

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 72 ++++++++++++++++++++++++++++++++++++++++++++---------
fs/ext4/ext4.h | 2 +
fs/ext4/inode.c | 5 +---
fs/ext4/mballoc.c | 23 +++++++++-------
4 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cfed283..4a53541 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1602,6 +1602,47 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
return ret;
}

+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
+{
+ s64 free_blocks;
+ ext4_fsblk_t root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+ free_blocks = percpu_counter_read(fbc);
+
+ if (!capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ /* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+ if (free_blocks - (nblocks + root_blocks) <
+ (4 * (FBC_BATCH * nr_cpu_ids))) {
+ /*
+ * We need to sum and claim under lock
+ * This is the slow patch which will be
+ * taken when we are very low on free blocks
+ */
+ if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
+ return -ENOSPC;
+ /* add root_blocks back */
+ percpu_counter_add(fbc, root_blocks);
+ return 0;
+ }
+#endif
+ if (free_blocks < (root_blocks + nblocks))
+ /* we don't have free space */
+ return -ENOSPC;
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(fbc, nblocks);
+ return 0;
+}
+
/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
@@ -1624,9 +1665,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
root_blocks = ext4_r_blocks_count(sbi->s_es);
#ifdef CONFIG_SMP
- if (free_blocks - root_blocks < FBC_BATCH)
+ /* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+ if (free_blocks - (nblocks + root_blocks) <
+ (4 * (FBC_BATCH * nr_cpu_ids))) {
free_blocks =
percpu_counter_sum(&sbi->s_freeblocks_counter);
+ }
#endif
if (free_blocks <= root_blocks)
/* we don't have free space */
@@ -1634,7 +1681,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
if (free_blocks - root_blocks < nblocks)
return free_blocks - root_blocks;
return nblocks;
- }
+}


/**
@@ -1713,14 +1760,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
/*
* With delalloc we already reserved the blocks
*/
- *count = ext4_has_free_blocks(sbi, *count);
- }
- if (*count == 0) {
- *errp = -ENOSPC;
- return 0; /*return with ENOSPC error */
+ if (ext4_claim_free_blocks(sbi, *count)) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
}
- num = *count;
-
/*
* Check quota for allocation of this block.
*/
@@ -1915,9 +1959,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- if (!EXT4_I(inode)->i_delalloc_reserved_flag)
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+ /*
+ * we allocated less blocks than we
+ * claimed. Add the difference back.
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+ }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
spin_lock(sb_bgl_lock(sbi, flex_group));
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7f11b25..3738039 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
unsigned long *count, int *errp);
extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1c289c1..d965a05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;

- if (ext4_has_free_blocks(sbi, total) < total) {
+ if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return -ENOSPC;
}
- /* reduce fs free blocks counter */
- percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 82dd0e4..4404b46 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* at write_begin() time for delayed allocation
* do not double accounting
*/
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
- percpu_counter_sub(&sbi->s_freeblocks_counter,
- ac->ac_b_ex.fe_len);
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+ ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+ /*
+ * we allocated less blocks than we calimed
+ * Add the difference back
+ */
+ percpu_counter_add(&sbi->s_freeblocks_counter,
+ ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+ }

if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
/*
* With delalloc we already reserved the blocks
*/
- ar->len = ext4_has_free_blocks(sbi, ar->len);
- }
-
- if (ar->len == 0) {
- *errp = -ENOSPC;
- return 0;
+ if (ext4_claim_free_blocks(sbi, ar->len)) {
+ *errp = -ENOSPC;
+ return 0;
+ }
}

2008-08-22 13:34:45

by Aneesh Kumar K.V

Subject: [RFC PATCH] percpu_counters: Add new function percpu_counter_sum_and_sub

percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)

This will later be used by ext4 code. It adds the local
per-cpu counter values into the global count and subtracts amount
from the global count if the global count is >= amount. This is
used during block reservation to check, under a lock, that we
have sufficient free blocks and, if so, to claim them.
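
A sketch of the intended usage (mirroring the ext4 caller in the
companion patch):

	/* slow path: fold all per-cpu deltas into fbc->count and claim   */
	/* nblocks in one locked step                                      */
	if (percpu_counter_sum_and_sub(&sbi->s_freeblocks_counter, nblocks))
		return -ENOSPC;   /* not enough free blocks */
	/* nblocks are now claimed; any unused ones are handed back later  */
	/* with percpu_counter_add()                                       */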

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
include/linux/percpu_counter.h | 17 +++++++++++++++++
lib/percpu_counter.c | 27 +++++++++++++++++++++++++++
2 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index af485b1..978db67 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -36,6 +36,7 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount);

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
@@ -53,6 +54,12 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return __percpu_counter_sum(fbc);
}

+static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
+ s64 amount)
+{
+ return __percpu_counter_sum_and_sub(fbc, amount);
+}
+
#if BITS_PER_LONG == 64
static inline s64 fbc_count(struct percpu_counter *fbc)
{
@@ -146,6 +153,16 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return percpu_counter_read(fbc);
}

+static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
+ s64 amount)
+{
+ if (fbc->count >= amount) {
+ percpu_counter_sub(fbc, amount);
+ return 0;
+ }
+ return -E2BIG;
+}
+
#endif /* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a866389..0336062 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -71,6 +71,33 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
}
EXPORT_SYMBOL(__percpu_counter_sum);

+ /*
+ * Add up all the per-cpu counts. If the result is greater than
+ * amount subtract amount from result and return 0. Otherwise return
+ * -E2BIG
+ */
+s64 __percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)
+{
+ s64 ret;
+ int cpu;
+
+ spin_lock(&fbc->lock);
+ ret = fbc->count;
+ for_each_online_cpu(cpu) {
+ s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+ ret += *pcount;
+ *pcount = 0;
+ }
+ if (ret >= amount) {
+ fbc->count = ret - amount;
+ spin_unlock(&fbc->lock);
+ return 0;
+ }
+ spin_unlock(&fbc->lock);
+ return -E2BIG;
+}
+EXPORT_SYMBOL(__percpu_counter_sum_and_sub);
+
static struct lock_class_key percpu_counter_irqsafe;

int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
--
1.6.0.2.g2ebc0


2008-08-22 18:01:52

by Mingming Cao

Subject: Re: [RFC PATCH] percpu_counters: Add new function percpu_counter_sum_and_sub


On Fri, 2008-08-22 at 19:04 +0530, Aneesh Kumar K.V wrote:
> percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)
>
> This will later be used by ext4 code. It adds the local
> per-cpu counter values into the global count and subtracts amount
> from the global count if the global count is >= amount. This is
> used during block reservation to check, under a lock, that we
> have sufficient free blocks and, if so, to claim them.
>

I'd suggest reposting the patch to lkml for this new interface change
when it is ready. It would be nice to have the ext4 part of the change
which uses this interface sent together in the same series.


The patch looks fine except for one small place; see the comment below.

You could add Reviewed-by: Mingming Cao <[email protected]>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> include/linux/percpu_counter.h | 17 +++++++++++++++++
> lib/percpu_counter.c | 27 +++++++++++++++++++++++++++
> 2 files changed, 44 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index af485b1..978db67 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -36,6 +36,7 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
> void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
> void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
> s64 __percpu_counter_sum(struct percpu_counter *fbc);
> +s64 __percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount);
>
> static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> {
> @@ -53,6 +54,12 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> +static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
> + s64 amount)
> +{
> + return __percpu_counter_sum_and_sub(fbc, amount);
> +}
> +
> #if BITS_PER_LONG == 64
> static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> @@ -146,6 +153,16 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return percpu_counter_read(fbc);
> }
>
> +static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
> + s64 amount)
> +{
> + if (fbc->count >= amount) {
> + percpu_counter_sub(fbc, amount);
> + return 0;
> + }
> + return -E2BIG;
> +}
> +
> #endif /* CONFIG_SMP */
>
> static inline void percpu_counter_inc(struct percpu_counter *fbc)
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index a866389..0336062 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -71,6 +71,33 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
> }
> EXPORT_SYMBOL(__percpu_counter_sum);
>
> + /*
> + * Add up all the per-cpu counts. If the result is greater than
> + * amount subtract amount from result and return 0. Otherwise return
> + * -E2BIG
> + */
> +s64 __percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount)

It seems this returns 0 on success and an error code on failure. In that
case, should the return type really be s64?

I noticed that the inline wrapper earlier in the patch returns "int", so
this might just be a copy-and-paste error:

+static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
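
A sketch of the signature fix being suggested here (the out-of-line
helper returning int to match the inline wrapper):

	int __percpu_counter_sum_and_sub(struct percpu_counter *fbc, s64 amount);

	static inline int percpu_counter_sum_and_sub(struct percpu_counter *fbc,
						     s64 amount)
	{
		return __percpu_counter_sum_and_sub(fbc, amount);
	}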


> +{
> + s64 ret;
> + int cpu;
> +
> + spin_lock(&fbc->lock);
> + ret = fbc->count;
> + for_each_online_cpu(cpu) {
> + s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> + ret += *pcount;
> + *pcount = 0;
> + }
> + if (ret >= amount) {
> + fbc->count = ret - amount;
> + spin_unlock(&fbc->lock);
> + return 0;
> + }
> + spin_unlock(&fbc->lock);
> + return -E2BIG;
> +}
> +EXPORT_SYMBOL(__percpu_counter_sum_and_sub);
> +
> static struct lock_class_key percpu_counter_irqsafe;
>
> int percpu_counter_init(struct percpu_counter *fbc, s64 amount)

2008-08-22 18:22:58

by Mingming Cao

Subject: Re: [RFC PATCH] ext4: Make sure all the block allocation paths reserve blocks


On Fri, 2008-08-22 at 19:04 +0530, Aneesh Kumar K.V wrote:
> With delayed allocation we need to make sure blocks are reserved
> before we attempt to allocate them. Otherwise we get a block
> allocation failure (ENOSPC) during writepages, which cannot
> be handled, and that would mean silent data loss (we do a printk
> stating that data will be lost).

The patch description is a little inaccurate. We already have block
reservation for delayed allocation, but it doesn't handle racing between
multiple threads when the filesystem is close to full, which results in
overbooking free blocks and ENOSPC later at writepages time.

> This patch updates the DIO
> and fallocate code paths to do block reservation before block
> allocation.
>
Do you mean the patch changes mballoc and the old block allocator to
claim free blocks before doing the real block allocation, and then,
after the allocation is done, to update the free blocks counter properly
(if allocated < requested)?

Yes, that is the code path common to DIO and fallocate, but I don't see
why we need block reservation for DIO and fallocate. Unlike delayed
allocation, they can handle an ENOSPC error and return it to the user
right after the block allocation request.

> When the free blocks count goes below a threshold we switch to
> a slow path which also looks at the other CPUs' accumulated percpu
> counter values.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/balloc.c | 72 ++++++++++++++++++++++++++++++++++++++++++++---------
> fs/ext4/ext4.h | 2 +
> fs/ext4/inode.c | 5 +---
> fs/ext4/mballoc.c | 23 +++++++++-------
> 4 files changed, 76 insertions(+), 26 deletions(-)
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index cfed283..4a53541 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -1602,6 +1602,47 @@ ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
> return ret;
> }
>
> +int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks)
> +{
> + s64 free_blocks;
> + ext4_fsblk_t root_blocks = 0;
> + struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
> +
> + free_blocks = percpu_counter_read(fbc);
> +
> + if (!capable(CAP_SYS_RESOURCE) &&
> + sbi->s_resuid != current->fsuid &&
> + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> + root_blocks = ext4_r_blocks_count(sbi->s_es);
> +#ifdef CONFIG_SMP
> + /* Each CPU can accumulate FBC_BATCH blocks in their local
> + * counters. So we need to make sure we have free blocks more
> + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> + */
> + if (free_blocks - (nblocks + root_blocks) <
> + (4 * (FBC_BATCH * nr_cpu_ids))) {
> + /*
> + * We need to sum and claim under lock
> + * This is the slow patch which will be
> + * taken when we are very low on free blocks
> + */
> + if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
> + return -ENOSPC;
> + /* add root_blocks back */
> + percpu_counter_add(fbc, root_blocks);
> + return 0;
> + }
> +#endif
> + if (free_blocks < (root_blocks + nblocks))
> + /* we don't have free space */
> + return -ENOSPC;
> +
> + /* reduce fs free blocks counter */
> + percpu_counter_sub(fbc, nblocks);
> + return 0;
> +}
> +

The only reason we need #ifdef CONFIG_SMP here is that FBC_BATCH is
undefined on UP. So instead of using FBC_BATCH, we could use
nr_cpu_ids * 4 directly and avoid nesting CONFIG_SMP in the ext4 code.

Better to have a define:

#define EXT4_FREEBLOCKS_LOW (4 * nr_cpu_ids * nr_cpu_ids)

That way, if this watermark later turns out to be too low, we can change
it easily.
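
A sketch of what ext4_claim_free_blocks() might then look like
(illustration only; the macro name follows the suggestion above, it is
not part of the posted patch):

	int ext4_claim_free_blocks(struct ext4_sb_info *sbi, ext4_fsblk_t nblocks)
	{
		s64 free_blocks;
		ext4_fsblk_t root_blocks = 0;
		struct percpu_counter *fbc = &sbi->s_freeblocks_counter;

		free_blocks = percpu_counter_read(fbc);
		if (!capable(CAP_SYS_RESOURCE) &&
		    sbi->s_resuid != current->fsuid &&
		    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
			root_blocks = ext4_r_blocks_count(sbi->s_es);

		/* no #ifdef CONFIG_SMP needed: nr_cpu_ids is 1 on UP */
		if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_LOW) {
			/* slow path: sum and claim under fbc->lock */
			if (percpu_counter_sum_and_sub(fbc, nblocks + root_blocks))
				return -ENOSPC;
			/* give back the root_blocks we over-claimed */
			percpu_counter_add(fbc, root_blocks);
			return 0;
		}
		if (free_blocks < (root_blocks + nblocks))
			return -ENOSPC;
		/* reduce the fs free blocks counter */
		percpu_counter_sub(fbc, nblocks);
		return 0;
	}
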
> /**
> * ext4_has_free_blocks()
> * @sbi: in-core super block structure.
> @@ -1624,9 +1665,15 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
> root_blocks = ext4_r_blocks_count(sbi->s_es);
> #ifdef CONFIG_SMP
> - if (free_blocks - root_blocks < FBC_BATCH)
> + /* Each CPU can accumulate FBC_BATCH blocks in their local
> + * counters. So we need to make sure we have free blocks more
> + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
> + */
> + if (free_blocks - (nblocks + root_blocks) <
> + (4 * (FBC_BATCH * nr_cpu_ids))) {
> free_blocks =
> percpu_counter_sum(&sbi->s_freeblocks_counter);
> + }
> #endif
> if (free_blocks <= root_blocks)
> /* we don't have free space */
> @@ -1634,7 +1681,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> if (free_blocks - root_blocks < nblocks)
> return free_blocks - root_blocks;
> return nblocks;
> - }
> +}
>
>
> /**
> @@ -1713,14 +1760,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> /*
> * With delalloc we already reserved the blocks
> */
> - *count = ext4_has_free_blocks(sbi, *count);
> - }
> - if (*count == 0) {
> - *errp = -ENOSPC;
> - return 0; /*return with ENOSPC error */
> + if (ext4_claim_free_blocks(sbi, *count)) {
> + *errp = -ENOSPC;
> + return 0; /*return with ENOSPC error */
> + }

Oh, these changes hurt DIO and fallocate, I think.

In fact, with these changes DIO and fallocate will _STOP_ allocating even
if there are some free blocks around, just fewer than requested, and will
return ENOSPC too early. With the current code they try to allocate as
much as possible.
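
To make the complaint concrete, an illustration of the behaviour change
(block counts made up):

	/* before: trim the request to whatever is still free */
	*count = ext4_has_free_blocks(sbi, *count);  /* ask for 100, get 40      */
	if (*count == 0)
		*errp = -ENOSPC;                     /* fails only when nothing is left */

	/* after: all-or-nothing claim */
	if (ext4_claim_free_blocks(sbi, *count))     /* 40 < 100, fails outright */
		*errp = -ENOSPC;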

> }
> - num = *count;
> -
> /*
> * Check quota for allocation of this block.
> */
> @@ -1915,9 +1959,13 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> le16_add_cpu(&gdp->bg_free_blocks_count, -num);
> gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
> spin_unlock(sb_bgl_lock(sbi, group_no));
> - if (!EXT4_I(inode)->i_delalloc_reserved_flag)
> - percpu_counter_sub(&sbi->s_freeblocks_counter, num);
> -
> + if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
> + /*
> + * we allocated less blocks than we
> + * claimed. Add the difference back.
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
> + }
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
> spin_lock(sb_bgl_lock(sbi, flex_group));
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7f11b25..3738039 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1047,6 +1047,8 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
> unsigned long *count, int *errp);
> extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
> ext4_fsblk_t goal, unsigned long *count, int *errp);
> +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
> + ext4_fsblk_t nblocks);
> extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
> ext4_fsblk_t nblocks);
> extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1c289c1..d965a05 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1537,13 +1537,10 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
> md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
> total = md_needed + nrblocks;
>
> - if (ext4_has_free_blocks(sbi, total) < total) {
> + if (ext4_claim_free_blocks(sbi, total)) {
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> return -ENOSPC;
> }
> - /* reduce fs free blocks counter */
> - percpu_counter_sub(&sbi->s_freeblocks_counter, total);
> -
> EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
> EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 82dd0e4..4404b46 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2977,9 +2977,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> * at write_begin() time for delayed allocation
> * do not double accounting
> */
> - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
> - percpu_counter_sub(&sbi->s_freeblocks_counter,
> - ac->ac_b_ex.fe_len);
> + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
> + ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
> + /*
> + * we allocated less blocks than we calimed
> + * Add the difference back
> + */
> + percpu_counter_add(&sbi->s_freeblocks_counter,
> + ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
> + }
>
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
> @@ -4391,14 +4397,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
> /*
> * With delalloc we already reserved the blocks
> */
> - ar->len = ext4_has_free_blocks(sbi, ar->len);
> - }
> -
> - if (ar->len == 0) {
> - *errp = -ENOSPC;
> - return 0;
> + if (ext4_claim_free_blocks(sbi, ar->len)) {
> + *errp = -ENOSPC;
> + return 0;
> + }
> }
> -
> while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
> ar->flags |= EXT4_MB_HINT_NOPREALLOC;
> ar->len--;

2008-08-22 18:29:59

by Mingming Cao

Subject: Re: [RFC PATCH] percpu_counters: make fbc->count read atomic on 32 bit architecture


On Fri, 2008-08-22 at 19:04 +0530, Aneesh Kumar K.V wrote:
> fbc->count is of type s64. The change was introduced by
> 0216bfcffe424a5473daa4da47440881b36c1f4, which changed the type
> from long to s64. Moving to s64 also means that on 32 bit architectures
> we can read a torn, and therefore wrong, value of fbc->count.
>
> percpu_counter_read() is also used from interrupt context, so
> use the irq-safe version of the spinlock while reading.
>

It's quite expensive to hold the lock to do percpu_counter_read on a
32 bit arch, which is the common case.

The types of the global counter and the local counters were explicitly
specified as s64 and s32: the global counter was changed from long to
s64, while the local counters were changed from long to s32, so that we
can avoid doing 64 bit updates in most cases. After all, the percpu
counter read is not an accurate value anyway.
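
For reference, a sketch of the batching being described, approximating
lib/percpu_counter.c of that era: the common case touches only the s32
per-cpu delta, and the s64 global is updated under fbc->lock only when
the batch threshold is crossed.

	void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
	{
		s64 count;
		s32 *pcount;
		int cpu = get_cpu();

		pcount = per_cpu_ptr(fbc->counters, cpu);
		count = *pcount + amount;
		if (count >= batch || count <= -batch) {
			spin_lock(&fbc->lock);        /* rare 64 bit update   */
			fbc->count += count;
			*pcount = 0;
			spin_unlock(&fbc->lock);
		} else {
			*pcount = count;              /* common 32 bit update */
		}
		put_cpu();
	}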

Mingming
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> CC: Peter Zijlstra <[email protected]>
> ---
> include/linux/percpu_counter.h | 23 +++++++++++++++++++++--
> 1 files changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 9007ccd..af485b1 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -53,10 +53,29 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +#if BITS_PER_LONG == 64
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> return fbc->count;
> }
> +#else
> +/* doesn't have atomic 64 bit operation */
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> +{
> + s64 ret;
> + unsigned long flags;
> + spin_lock_irqsave(&fbc->lock, flags);
> + ret = fbc->count;
> + spin_unlock_irqrestore(&fbc->lock, flags);
> + return ret;
> +
> +}
> +#endif
> +
> +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +{
> + return fbc_count(fbc);
> +}
>
> /*
> * It is possible for the percpu_counter_read() to return a small negative
> @@ -65,7 +84,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> */
> static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
> {
> - s64 ret = fbc->count;
> + s64 ret = fbc_count(fbc);
>
> barrier(); /* Prevent reloads of fbc->count */
> if (ret >= 0)

2008-08-22 18:34:12

by Peter Zijlstra

Subject: Re: [RFC PATCH] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Fri, 2008-08-22 at 11:29 -0700, Mingming Cao wrote:
> On Fri, 2008-08-22 at 19:04 +0530, Aneesh Kumar K.V wrote:
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4, which changed the type
> > from long to s64. Moving to s64 also means that on 32 bit architectures
> > we can read a torn, and therefore wrong, value of fbc->count.
> >
> > percpu_counter_read() is also used from interrupt context, so
> > use the irq-safe version of the spinlock while reading.
> >
>
> It's quite expensive to hold the lock to do percpu_counter_read on a
> 32 bit arch, which is the common case.
>
> The types of the global counter and the local counters were explicitly
> specified as s64 and s32: the global counter was changed from long to
> s64, while the local counters were changed from long to s32, so that we
> can avoid doing 64 bit updates in most cases. After all, the percpu
> counter read is not an accurate value anyway.

seqlocks come to mind.
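
A sketch of how that could look (illustration only; it assumes a new
seqcount_t field, fbc->seq, written under fbc->lock, which is not in
the current structure):

	/* writer side, already holding fbc->lock */
	write_seqcount_begin(&fbc->seq);      /* fbc->seq is an assumed new field */
	fbc->count += count;
	write_seqcount_end(&fbc->seq);

	/* lock-free, irq-safe reader */
	static inline s64 fbc_count(struct percpu_counter *fbc)
	{
		unsigned seq;
		s64 ret;

		do {
			seq = read_seqcount_begin(&fbc->seq);
			ret = fbc->count;
		} while (read_seqcount_retry(&fbc->seq, seq));
		return ret;
	}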