Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ab22297..cc4737e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1057,7 +1057,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
EXT4_I(inode)->i_reserved_meta_blocks;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return total;
+ return (total << inode->i_blkbits);
}
/*
* Calculate the number of metadata blocks need to reserve
--
1.6.0.4
Currently all quota functions except vfs_dq_reserve_block() are
called without i_block_reservation_lock. This results in an
ext4_reservation vs quota_reservation inconsistency which provokes an
incorrect reservation transfer ==> incorrect quota after transfer.
Race (1)
| Task 1 (chown) | Task 2 (truncate) |
| dquot_transfer | |
| ->down_write(dqptr_sem) | ext4_da_release_spac |
| -->dquot_get_reserved_space | ->lock(i_block_reservation_lock) |
| --->get_reserved_space | /* decrement reservation */ |
| ---->ext4_get_reserved_space | ->unlock(i_block_reservation_lock) |
| ----->lock(i_block_rsv_lock) | /* During this time window |
| /* Read ext4_rsv from inode */ | * fs's reservation not equals |
| /* transfer it to new quota */ | * to quota's */ |
| ->up_write(dqptr_sem) | ->vfs_dq_release_reservation_block() |
| | /* quota_rsv goes negative here */ |
| | |
Race (2)
| Task 1 (chown) | Task 2 (flush-8:16) |
| dquot_transfer() | ext4_mb_mark_diskspace_used() |
| ->down_write(dqptr_sem) | ->vfs_dq_claim_block() |
| --->get_reserved_space() | /* After this moment */ |
| --->ext4_get_reserved_space() | /* ext4_rsv != quota_ino_rsv */ |
| /* Read rsv from inode which | |
| ->dquot_free_reserved_space() | |
| /* quota_rsv goes negative */ | |
| | |
| | dquot_free_reserved_space() |
| | /* finally dec ext4_ino_rsv */ |
So, in order to protect us from this type of race we always have to
provide the ext4_ino_rsv == quota_ino_rsv guarantee. And this is only
possible when i_block_reservation_lock is taken before entering any
quota operations.
In fact i_block_reservation_lock is held by ext4_da_reserve_space()
while calling vfs_dq_reserve_block(). Locks are held in the following order
i_block_reservation_lock > dqptr_sem
This may result in deadlock because of different lock ordering:
ext4_da_reserve_space() dquot_transfer()
lock(i_block_reservation_lock) down_write(dqptr_sem)
down_write(dqptr_sem) lock(i_block_reservation_lock)
But this does not happen, only because both callers must hold i_mutex, so
serialization happens on i_mutex.
To prevent ext4_reservation vs dquot_reservation inconsistency we
have to make the following changes
- Reorganize the locking ordering as follows:
i_block_reservation_lock > dqptr_sem
This means that all functions which call vfs_dq_XXX must acquire
i_block_reservation_lock first.
- Move space claiming to ext4_da_update_reserve_space()
Only here do we know whether we are dealing with data or metadata
- Delay dquot_claim for metadata until the proper moment (until
we are allowed to decrement inode->i_reserved_meta_blocks)
The most useful test case for delalloc+quota is concurrent tasks
with write+truncate vs chown for the same file.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 33 ++++++++++++++++++++++++---------
fs/ext4/mballoc.c | 6 ------
2 files changed, 24 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc4737e..979362d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1047,16 +1047,23 @@ cleanup:
out:
return err;
}
+/*
+ * Quota_transfer callback.
+ * During quota transfer we have to transfer rsv-blocks from one dquot to
+ * another. inode must be protected from concurrent reservation/reclamation.
+ * Locking ordering for all space reservation code:
+ * i_block_reservation_lock > dqptr_sem
+ * This is differ from i_block,i_lock locking ordering, but this is the
+ * only possible way.
+ * NOTE: Caller must hold i_block_reservation_lock.
+ */
qsize_t ext4_get_reserved_space(struct inode *inode)
{
unsigned long long total;
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
total = EXT4_I(inode)->i_reserved_data_blocks +
EXT4_I(inode)->i_reserved_meta_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
return (total << inode->i_blkbits);
}
/*
@@ -1096,7 +1103,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free;
+ int total, mdb, mdb_free, mdb_claim = 0;
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
/* recalculate the number of metablocks still need to be reserved */
@@ -1109,7 +1116,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
if (mdb_free) {
/* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+ mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
+ BUG_ON(mdb_free < mdb_claim);
+ mdb_free -= mdb_claim;
/* update fs dirty blocks counter */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
@@ -1119,14 +1128,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= used;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_reserved_data_blocks -= used;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+ vfs_dq_claim_block(inode, used + mdb_claim);
/*
* free those over-booking quota for metadata blocks
*/
if (mdb_free)
vfs_dq_release_reservation_block(inode, mdb_free);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
/*
* If we have done all the pending block allocations and if
@@ -1863,8 +1874,8 @@ repeat:
}
if (ext4_claim_free_blocks(sbi, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
vfs_dq_release_reservation_block(inode, total);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
@@ -1921,9 +1932,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
EXT4_I(inode)->i_reserved_meta_blocks = mdb;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
vfs_dq_release_reservation_block(inode, release);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -5433,7 +5444,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
+ /* i_block_reservation must being held in order to avoid races
+ * with concurent block reservation. */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
if (error) {
ext4_journal_stop(handle);
return error;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba1282..d4c52db 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2750,12 +2750,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
- else {
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- ac->ac_b_ex.fe_len);
- /* convert reserved quota blocks to real quota blocks */
- vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
- }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
--
1.6.0.4
Currently all quota block reservation macros contains hard-coded "2"
aka MAXQUOTAS value. This is no good because in some places it is not
obvious to understand what does this digit represent. Let's introduce
new macro with self descriptive name.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4_jbd2.h | 8 ++++++--
fs/ext4/extents.c | 2 +-
fs/ext4/inode.c | 2 +-
fs/ext4/migrate.c | 4 ++--
fs/ext4/namei.c | 8 ++++----
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a286598..404ab6c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/*
* Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
* This include super block, inode block, quota blocks and xattr blocks
*/
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -99,6 +100,9 @@
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
int
ext4_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b..f73c3ff 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2167,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 979362d..211722b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5438,7 +5438,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b8..8646149 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
* So allocate a credit of 3. We may update
* quota (user and group).
*/
- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
@@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode,
EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ 1);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fde08c9..17a17e1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
--
1.6.0.4
Inside a ->setattr() call both ATTR_UID and ATTR_GID may be valid.
This means that we may end up transferring all quotas. And
we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in
the case of QUOTA_INIT_BLOCKS.
Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/inode.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 211722b..d42e954 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5439,7 +5439,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
--
1.6.0.4
Dmitry Monakhov wrote:
> Signed-off-by: Dmitry Monakhov <[email protected]>
Thanks,
Reviewed-by: Eric Sandeen <[email protected]>
> ---
> fs/ext4/inode.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ab22297..cc4737e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1057,7 +1057,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
> EXT4_I(inode)->i_reserved_meta_blocks;
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>
> - return total;
> + return (total << inode->i_blkbits);
> }
> /*
> * Calculate the number of metadata blocks need to reserve
On Wed, Nov 25, 2009 at 09:57:39AM +0300, Dmitry Monakhov wrote:
> Currently all quota's functions except vfs_dq_reserve_block()
> called without i_block_reservation_lock. This result in
> ext4_reservation vs quota_reservation inconsistency which provoke
> incorrect reservation transfer ==> incorrect quota after transfer.
>
> Race (1)
> | Task 1 (chown) | Task 2 (truncate) |
> | dquot_transfer | |
> | ->down_write(dqptr_sem) | ext4_da_release_spac |
> | -->dquot_get_reserved_space | ->lock(i_block_reservation_lock) |
> | --->get_reserved_space | /* decrement reservation */ |
> | ---->ext4_get_reserved_space | ->unlock(i_block_reservation_lock) |
> | ----->lock(i_block_rsv_lock) | /* During this time window |
> | /* Read ext4_rsv from inode */ | * fs's reservation not equals |
> | /* transfer it to new quota */ | * to quota's */ |
> | ->up_write(dqptr_sem) | ->vfs_dq_release_reservation_block() |
> | | /* quota_rsv goes negative here */ |
> | | |
>
> Race (2)
> | Task 1 (chown) | Task 2 (flush-8:16) |
> | dquot_transfer() | ext4_mb_mark_diskspace_used() |
> | ->down_write(dqptr_sem) | ->vfs_dq_claim_block() |
> | --->get_reserved_space() | /* After this moment */ |
> | --->ext4_get_reserved_space() | /* ext4_rsv != quota_ino_rsv */ |
> | /* Read rsv from inode which | |
> | ->dquot_free_reserved_space() | |
> | /* quota_rsv goes negative */ | |
> | | |
> | | dquot_free_reserved_space() |
> | | /* finally dec ext4_ino_rsv */ |
>
> So, in order to protect us from this type of races we always have to
> provides ext4_ino_rsv == quot_ino_rsv guarantee. And this is only
> possible then i_block_reservation_lock is taken before entering any
> quota operations.
>
> In fact i_block_reservation_lock is held by ext4_da_reserve_space()
> while calling vfs_dq_reserve_block(). Lock are held in following order
> i_block_reservation_lock > dqptr_sem
>
> This may result in deadlock because of different lock ordering:
> ext4_da_reserve_space() dquot_transfer()
> lock(i_block_reservation_lock) down_write(dqptr_sem)
> down_write(dqptr_sem) lock(i_block_reservation_lock)
>
> But this not happen only because both callers must have i_mutex so
> serialization happens on i_mutex.
But that down_write can sleep right ?
For example:
http://bugzilla.kernel.org/show_bug.cgi?id=14739
-aneesh
"Aneesh Kumar K.V" <[email protected]> writes:
> On Wed, Nov 25, 2009 at 09:57:39AM +0300, Dmitry Monakhov wrote:
>> Currently all quota's functions except vfs_dq_reserve_block()
>> called without i_block_reservation_lock. This result in
>> ext4_reservation vs quota_reservation inconsistency which provoke
>> incorrect reservation transfer ==> incorrect quota after transfer.
>>
>> Race (1)
>> | Task 1 (chown) | Task 2 (truncate) |
>> | dquot_transfer | |
>> | ->down_write(dqptr_sem) | ext4_da_release_spac |
>> | -->dquot_get_reserved_space | ->lock(i_block_reservation_lock) |
>> | --->get_reserved_space | /* decrement reservation */ |
>> | ---->ext4_get_reserved_space | ->unlock(i_block_reservation_lock) |
>> | ----->lock(i_block_rsv_lock) | /* During this time window |
>> | /* Read ext4_rsv from inode */ | * fs's reservation not equals |
>> | /* transfer it to new quota */ | * to quota's */ |
>> | ->up_write(dqptr_sem) | ->vfs_dq_release_reservation_block() |
>> | | /* quota_rsv goes negative here */ |
>> | | |
>>
>> Race (2)
>> | Task 1 (chown) | Task 2 (flush-8:16) |
>> | dquot_transfer() | ext4_mb_mark_diskspace_used() |
>> | ->down_write(dqptr_sem) | ->vfs_dq_claim_block() |
>> | --->get_reserved_space() | /* After this moment */ |
>> | --->ext4_get_reserved_space() | /* ext4_rsv != quota_ino_rsv */ |
>> | /* Read rsv from inode which | |
>> | ->dquot_free_reserved_space() | |
>> | /* quota_rsv goes negative */ | |
>> | | |
>> | | dquot_free_reserved_space() |
>> | | /* finally dec ext4_ino_rsv */ |
>>
>> So, in order to protect us from this type of races we always have to
>> provides ext4_ino_rsv == quot_ino_rsv guarantee. And this is only
>> possible then i_block_reservation_lock is taken before entering any
>> quota operations.
>>
>> In fact i_block_reservation_lock is held by ext4_da_reserve_space()
>> while calling vfs_dq_reserve_block(). Lock are held in following order
>> i_block_reservation_lock > dqptr_sem
>>
>> This may result in deadlock because of different lock ordering:
>> ext4_da_reserve_space() dquot_transfer()
>> lock(i_block_reservation_lock) down_write(dqptr_sem)
>> down_write(dqptr_sem) lock(i_block_reservation_lock)
>>
>> But this not happen only because both callers must have i_mutex so
>> serialization happens on i_mutex.
>
>
> But that down_write can sleep right ?
Absolutely right. I've fixed an issue, but overlooked the BIGGEST one.
So of course my patch is wrong; even if we acquire the locks in the
different order "dqptr_sem > i_block_reservation_lock"
we still get into sleeping-spinlock problems via the following scenario:
ext4_da_update_reserve_space()
->dquot_claim_space()
ASSUMES that we hold i_block_reservation_lock here.
-->mark_dquot_dirty()
--->ext4_write_dquot()
if (journalled quota) ext4_write_dquot();
---->dquot_commit()
----->mutex_lock(&dqopt->dqio_mutex); <<< sleep here.
This means that we have fully redesign quota reservation locking.
As i already suggested previously here:
http://thread.gmane.org/gmane.comp.file-systems.ext4/16576/focus=16587
* Introduce an i_block analog for generic reserved space management:
We may introduce an i_rsv_block field in the generic inode, protected
by i_lock (similar to i_block).
Introduce inc/dec/set/get methods similar to inode_get_bytes,
inode_sub_bytes.. .
This value is managed internally by quota code. Perform reservation
management inside dquot_reserve_space, dquot_release_reservation
without interfering with fs internals, as we do for i_blocks.
So locking sequence will looks like follows
ext4_XXX_space()
->spin_lock(&i_block_reservation_lock)
->// update ext4_rsv fields
->spin_unlock(&i_block_reservation_lock)
->vfs_XXX_space()
-->down_XXX(&dqptr_sem)
--->inode_XXX_reserved_bytes << analog for inode_XXX_bytes()
---->spin_lock(&inode->i_lock)
---->// update inode->i_rsv_block (and inode->i_block if necessary)
---->spin_unlock(&inode->i_lock)
--->mark_dirty()
IMHO this is best way because:
1)This is the only sane way to fix #14739
2)This brings to well defined VFS interface for reserved space management.
I'll prepare the patch soon.
>
> For example:
> http://bugzilla.kernel.org/show_bug.cgi?id=14739
>
> -aneesh
> LocalWords: rsv inode dquot
On Wed, 2009-11-25 at 09:57 +0300, Dmitry Monakhov wrote:
> Currently all quota block reservation macros contains hard-coded "2"
> aka MAXQUOTAS value. This is no good because in some places it is not
> obvious to understand what does this digit represent. Let's introduce
> new macro with self descriptive name.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
looks sane, MAXQUOTAS is either user or group quota.
Acked-by: Mingming Cao <[email protected]>
> ---
> fs/ext4/ext4_jbd2.h | 8 ++++++--
> fs/ext4/extents.c | 2 +-
> fs/ext4/inode.c | 2 +-
> fs/ext4/migrate.c | 4 ++--
> fs/ext4/namei.c | 8 ++++----
> 5 files changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index a286598..404ab6c 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -49,7 +49,7 @@
>
> #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
> EXT4_XATTR_TRANS_BLOCKS - 2 + \
> - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
> + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
>
> /*
> * Define the number of metadata blocks we need to account to modify data.
> @@ -57,7 +57,7 @@
> * This include super block, inode block, quota blocks and xattr blocks
> */
> #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
> - 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
> + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
>
> /* Delete operations potentially hit one directory's namespace plus an
> * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
> @@ -92,6 +92,7 @@
> * but inode, sb and group updates are done only once */
> #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
> (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
> +
> #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
> (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
> #else
> @@ -99,6 +100,9 @@
> #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
> #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
> #endif
> +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
> +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
> +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
>
> int
> ext4_mark_iloc_dirty(handle_t *handle,
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 715264b..f73c3ff 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2167,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
> correct_index = 1;
> credits += (ext_depth(inode)) + 1;
> }
> - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
> + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
>
> err = ext4_ext_truncate_extend_restart(handle, inode, credits);
> if (err)
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 979362d..211722b 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5438,7 +5438,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>
> /* (user+group)*(old+new) structure, inode write (sb,
> * inode block, ? - but truncate inode update has it) */
> - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
> + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
> EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
> index a93d5b8..8646149 100644
> --- a/fs/ext4/migrate.c
> +++ b/fs/ext4/migrate.c
> @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
> * So allocate a credit of 3. We may update
> * quota (user and group).
> */
> - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
> + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
>
> if (ext4_journal_extend(handle, needed) != 0)
> retval = ext4_journal_restart(handle, needed);
> @@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
> handle = ext4_journal_start(inode,
> EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
> EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
> + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
> + 1);
> if (IS_ERR(handle)) {
> retval = PTR_ERR(handle);
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index fde08c9..17a17e1 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
> retry:
> handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
> retry:
> handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
> retry:
> handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
> retry:
> handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
> - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
On Wed, 2009-11-25 at 09:57 +0300, Dmitry Monakhov wrote:
> Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid
> This means that we may end-up with transferring all quotas. Add
> we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in
> case of QUOTA_INIT_BLOCKS.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
Seems correct to me. It looks like ext3 need similar fix, mind to send a
ext3 patch as well?
Reviewed-by: Mingming Cao <[email protected]>
> ---
> fs/ext4/inode.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 211722b..d42e954 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5439,7 +5439,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
> /* (user+group)*(old+new) structure, inode write (sb,
> * inode block, ? - but truncate inode update has it) */
> handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
> - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
> + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> goto err_out;
On Wed, 2009-11-25 at 09:57 +0300, Dmitry Monakhov wrote:
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/inode.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ab22297..cc4737e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1057,7 +1057,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
> EXT4_I(inode)->i_reserved_meta_blocks;
> spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>
> - return total;
> + return (total << inode->i_blkbits);
> }
> /*
> * Calculate the number of metadata blocks need to reserve
It's a pity that we missed this fix at the time the change set was merged
upstream; thanks for catching this.
Acked-by: Mingming Cao <[email protected]>
Mingming
Mingming <[email protected]> writes:
> On Wed, 2009-11-25 at 09:57 +0300, Dmitry Monakhov wrote:
>> Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid
>> This means that we may end-up with transferring all quotas. Add
>> we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in
>> case of QUOTA_INIT_BLOCKS.
>>
>> Signed-off-by: Dmitry Monakhov <[email protected]>
>
> Seems correct to me. It looks like ext3 need similar fix, mind to send a
> ext3 patch as well?
Done.
>
> Reviewed-by: Mingming Cao <[email protected]>
>> ---
>> fs/ext4/inode.c | 2 +-
>> 1 files changed, 1 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 211722b..d42e954 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -5439,7 +5439,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>> /* (user+group)*(old+new) structure, inode write (sb,
>> * inode block, ? - but truncate inode update has it) */
>> handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
>> - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
>> + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
>> if (IS_ERR(handle)) {
>> error = PTR_ERR(handle);
>> goto err_out;
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2009-11-25 at 09:57 +0300, Dmitry Monakhov wrote:
> Currently all quota's functions except vfs_dq_reserve_block()
> called without i_block_reservation_lock. This result in
> ext4_reservation vs quota_reservation inconsistency which provoke
> incorrect reservation transfer ==> incorrect quota after transfer.
>
> Race (1)
> | Task 1 (chown) | Task 2 (truncate) |
> | dquot_transfer | |
> | ->down_write(dqptr_sem) | ext4_da_release_spac |
> | -->dquot_get_reserved_space | ->lock(i_block_reservation_lock) |
> | --->get_reserved_space | /* decrement reservation */ |
> | ---->ext4_get_reserved_space | ->unlock(i_block_reservation_lock) |
> | ----->lock(i_block_rsv_lock) | /* During this time window |
> | /* Read ext4_rsv from inode */ | * fs's reservation not equals |
> | /* transfer it to new quota */ | * to quota's */ |
> | ->up_write(dqptr_sem) | ->vfs_dq_release_reservation_block() |
> | | /* quota_rsv goes negative here */ |
> | | |
>
it seems to me the issue is that quota_rsv goes negative here. hmm, but we
could still possibly get a negative quota rsv with chown and truncate
serialized, no? I am still not very clear on how the locking reorder
will protect against getting a negative quota reservation.
> Race (2)
> | Task 1 (chown) | Task 2 (flush-8:16) |
> | dquot_transfer() | ext4_mb_mark_diskspace_used() |
> | ->down_write(dqptr_sem) | ->vfs_dq_claim_block() |
> | --->get_reserved_space() | /* After this moment */ |
> | --->ext4_get_reserved_space() | /* ext4_rsv != quota_ino_rsv */ |
> | /* Read rsv from inode which | |
> | ->dquot_free_reserved_space() | |
> | /* quota_rsv goes negative */ | |
> | | |
> | | dquot_free_reserved_space() |
> | | /* finally dec ext4_ino_rsv */ |
>
> So, in order to protect us from this type of races we always have to
> provides ext4_ino_rsv == quot_ino_rsv guarantee. And this is only
> possible then i_block_reservation_lock is taken before entering any
> quota operations.
>
> In fact i_block_reservation_lock is held by ext4_da_reserve_space()
> while calling vfs_dq_reserve_block(). Lock are held in following order
> i_block_reservation_lock > dqptr_sem
>
> This may result in deadlock because of different lock ordering:
> ext4_da_reserve_space() dquot_transfer()
> lock(i_block_reservation_lock) down_write(dqptr_sem)
> down_write(dqptr_sem) lock(i_block_reservation_lock)
>
> But this not happen only because both callers must have i_mutex so
> serialization happens on i_mutex.
>
yes, the i_mutex lock is there to prevent the deadlock.
> To prevent ext4_reservation vs dquot_reservation inconsistency we
> have to make following changes
> - Reorganize locking ordering like follows:
> i_block_reservation_lock > dqptr_sem
> This means what all functions which call vfs_dq_XXX must acquire
> i_block_reservation_lock before.
>
> - Move space claiming to ext4_da_update_reserve_space()
> Only here we do know what we are dealing with data or metadata
>
> - Delay dquot_claim for metadata before it until proper moment(
> until we are allowed to decrement inodes->i_reserved_meta_blocks)
>
Now these make sense to me: you are trying to keep the reserved quota
and the reserved blocks consistent at all times. However, the issue is that
dquot_transfer() only transfers the reserved quotas, but does not decrement
the reserved data/metadata blocks for delalloc, so we will end up with
negative values too... I don't see how this helps.
Perhaps we should not transfer the quota reservation to another inode?
Only the claimed quotas? I actually wonder whether we need to
transfer the quota reservation in the first place. Now it seems to me the
right thing to do is to not transfer the reserved quota before it is
really claimed. This would make it possible to keep it consistent with the
reserved delalloc blocks... CCing Jan, who might have some good ideas...
> The most useful test case for delalloc+quota is concurrent tasks
> with write+truncate vs chown for the same file.
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/ext4/inode.c | 33 ++++++++++++++++++++++++---------
> fs/ext4/mballoc.c | 6 ------
> 2 files changed, 24 insertions(+), 15 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cc4737e..979362d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1047,16 +1047,23 @@ cleanup:
> out:
> return err;
> }
> +/*
> + * Quota_transfer callback.
> + * During quota transfer we have to transfer rsv-blocks from one dquot to
> + * another. inode must be protected from concurrent reservation/reclamation.
> + * Locking ordering for all space reservation code:
> + * i_block_reservation_lock > dqptr_sem
> + * This is differ from i_block,i_lock locking ordering, but this is the
> + * only possible way.
> + * NOTE: Caller must hold i_block_reservation_lock.
> + */
>
> qsize_t ext4_get_reserved_space(struct inode *inode)
> {
> unsigned long long total;
>
> - spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> total = EXT4_I(inode)->i_reserved_data_blocks +
> EXT4_I(inode)->i_reserved_meta_blocks;
> - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> -
> return (total << inode->i_blkbits);
> }
> /*
> @@ -1096,7 +1103,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
> static void ext4_da_update_reserve_space(struct inode *inode, int used)
> {
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> - int total, mdb, mdb_free;
> + int total, mdb, mdb_free, mdb_claim = 0;
>
> spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> /* recalculate the number of metablocks still need to be reserved */
> @@ -1109,7 +1116,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
>
> if (mdb_free) {
> /* Account for allocated meta_blocks */
> - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
> + mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
> + BUG_ON(mdb_free < mdb_claim);
> + mdb_free -= mdb_claim;
>
> /* update fs dirty blocks counter */
> percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
> @@ -1119,14 +1128,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
>
> /* update per-inode reservations */
> BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
> - EXT4_I(inode)->i_reserved_data_blocks -= used;
> - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> + EXT4_I(inode)->i_reserved_data_blocks -= used;
> + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
> + vfs_dq_claim_block(inode, used + mdb_claim);
>
> /*
> * free those over-booking quota for metadata blocks
> */
> if (mdb_free)
> vfs_dq_release_reservation_block(inode, mdb_free);
> + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>
> /*
> * If we have done all the pending block allocations and if
> @@ -1863,8 +1874,8 @@ repeat:
> }
>
> if (ext4_claim_free_blocks(sbi, total)) {
> - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> vfs_dq_release_reservation_block(inode, total);
> + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
> yield();
> goto repeat;
> @@ -1921,9 +1932,9 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
>
> BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
> EXT4_I(inode)->i_reserved_meta_blocks = mdb;
> - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
>
> vfs_dq_release_reservation_block(inode, release);
> + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> }
>
> static void ext4_da_page_release_reservation(struct page *page,
> @@ -5433,7 +5444,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
> error = PTR_ERR(handle);
> goto err_out;
> }
> + /* i_block_reservation must being held in order to avoid races
> + * with concurent block reservation. */
> + spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
> + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> if (error) {
> ext4_journal_stop(handle);
> return error;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bba1282..d4c52db 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2750,12 +2750,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
> if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
> /* release all the reserved blocks if non delalloc */
> percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
> - else {
> - percpu_counter_sub(&sbi->s_dirtyblocks_counter,
> - ac->ac_b_ex.fe_len);
> - /* convert reserved quota blocks to real quota blocks */
> - vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
> - }
>
I'd prefer to keep the percpu_counter_sub(&sbi->s_dirtyblocks_counter,
used + mdb_claim); here, to stay together with the non-delalloc case. Agreed
that the quota claim could be deferred and moved to
ext4_da_update_reserve_space() as you proposed.
> if (sbi->s_log_groups_per_flex) {
> ext4_group_t flex_group = ext4_flex_group(sbi,
On Mon, Dec 07, 2009 at 10:41:15PM +0300, Dmitry Monakhov wrote:
> Absolutely right. I've fixed an issue, but overlooked the BIGGEST one.
> So off course my patch is wrong, even if we will acquire lock in
> different order " dqptr_sem > i_block_reservation_lock"
> we still get into sleeping-spinlock problems via the following scenario:
> ext4_da_update_reserve_space()
> ->dquot_claim_space()
> ASSUMES that we hold i_block_reservation_lock here.
> -->mark_dquot_dirty()
> --->ext4_write_dquot()
> if (journalled quota) ext4_write_dquot();
> ---->dquot_commit()
> ----->mutex_lock(&dqopt->dqio_mutex); <<< sleep here.
>
> This means that we have fully redesign quota reservation locking.
> As i already suggested previously here:
> http://thread.gmane.org/gmane.comp.file-systems.ext4/16576/focus=16587
Given this, should I include this patch for now, given that it does
fix _one_ race, or should I hold off until you redo the locking? How
long do you think to send a revised/new patch?
- Ted
[email protected] writes:
> On Mon, Dec 07, 2009 at 10:41:15PM +0300, Dmitry Monakhov wrote:
>> Absolutely right. I've fixed an issue, but overlooked the BIGGEST one.
>> So off course my patch is wrong, even if we will acquire lock in
>> different order " dqptr_sem > i_block_reservation_lock"
>> we still get into sleeping-spinlock problems via the following scenario:
>> ext4_da_update_reserve_space()
>> ->dquot_claim_space()
>> ASSUMES that we hold i_block_reservation_lock here.
>> -->mark_dquot_dirty()
>> --->ext4_write_dquot()
>> if (journalled quota) ext4_write_dquot();
>> ---->dquot_commit()
>> ----->mutex_lock(&dqopt->dqio_mutex); <<< sleep here.
>>
>> This means that we have fully redesign quota reservation locking.
>> As i already suggested previously here:
>> http://thread.gmane.org/gmane.comp.file-systems.ext4/16576/focus=16587
>
> Given this, should I include this patch for now, given that it does
> fix _one_ race, or should I hold off until you redo the locking? How
> long do you think to send a revised/new patch?
Please wait until a good version has been approved by all the people involved.
I've already prepared and tested an RFC version which solves all known
issues. I'll send the patch set in a minute.