2009-12-10 12:25:40

by Dmitry Monakhov

[permalink] [raw]
Subject: [PATCH 1/2] quota: decouple fs reserved space from quota reservation. [V3]

Currently inode_reservation is managed by fs itself and this
reservation is transfered on dquot_transfer(). This means what
inode_reservation must always be in sync with
dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result
in incorrect quota(WARN_ON in dquot_claim_reserved_space() will be
triggered)
This is not easy because of complex locking order issues
for example http://bugzilla.kernel.org/show_bug.cgi?id=14739

The patch introduce quota reservation field for each fs-inode
(fs specific inode is used in order to prevent bloating generic
vfs inode). This reservation is managed by quota code internally
similar to i_blocks/i_bytes and may not be always in sync with
internal fs reservation.

Also perform some code rearrangement:
- Unify dquot_reserve_space() and dquot_reserve_space()
- Unify dquot_release_reserved_space() and dquot_free_space()
- Also this patch add missing warning update to release_rsv()
dquot_release_reserved_space() must call flush_warnings() as
dquot_free_space() does.

Changes from V1:
- move qutoa_reservation field from vfs_inode to fs_inode
- account reservation in bytes instead of blocks.

Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/quota/dquot.c | 219 ++++++++++++++++++++++++++++---------------------
include/linux/quota.h | 5 +-
2 files changed, 127 insertions(+), 97 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c4..66221f8 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1388,6 +1388,71 @@ void vfs_dq_drop(struct inode *inode)
EXPORT_SYMBOL(vfs_dq_drop);

/*
+ * inode_reserved_space is managed internally by quota, and protected by
+ * i_lock similar to i_blocks+i_bytes.
+ */
+static qsize_t* inode_reserved_space(struct inode * inode)
+{
+ /* Filesystem must explicitly define it's own method in order to use
+ * quota reservation interface */
+ BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
+ return inode->i_sb->dq_op->get_reserved_space(inode);
+}
+
+static void inode_add_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) += number;
+ spin_unlock(&inode->i_lock);
+}
+
+static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ inode->i_blocks += number >> 9;
+ number &= 511;
+ inode->i_bytes += number;
+ if (inode->i_bytes >= 512) {
+ inode->i_blocks++;
+ inode->i_bytes -= 512;
+ }
+ spin_unlock(&inode->i_lock);
+}
+
+static void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+{
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ spin_unlock(&inode->i_lock);
+}
+
+static qsize_t inode_get_rsv_space(struct inode *inode)
+{
+ qsize_t ret;
+ spin_lock(&inode->i_lock);
+ ret = *inode_reserved_space(inode);
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+static void __inode_add_bytes(struct inode *inode, qsize_t number, int reserve)
+{
+ if (reserve)
+ inode_add_rsv_space(inode, number);
+ else
+ inode_add_bytes(inode, number);
+}
+
+static void __inode_sub_bytes(struct inode *inode, qsize_t number, int reserve)
+{
+ if (reserve)
+ inode_sub_rsv_space(inode, number);
+ else
+ inode_sub_bytes(inode, number);
+}
+
+/*
* Following four functions update i_blocks+i_bytes fields and
* quota information (together with appropriate checks)
* NOTE: We absolutely rely on the fact that caller dirties
@@ -1405,6 +1470,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
int cnt, ret = QUOTA_OK;
char warntype[MAXQUOTAS];

+ /*
+ * First test before acquiring mutex - solves deadlocks when we
+ * re-enter the quota code and are already holding the mutex
+ */
+ if (IS_NOQUOTA(inode)) {
+ __inode_add_bytes(inode, number, reserve);
+ goto out;
+ }
+
+ down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ if (IS_NOQUOTA(inode)) {
+ __inode_add_bytes(inode, number, reserve);
+ goto out_unlock;
+ }
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warntype[cnt] = QUOTA_NL_NOWARN;

@@ -1415,7 +1495,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
== NO_QUOTA) {
ret = NO_QUOTA;
- goto out_unlock;
+ goto out_data_unlock;
}
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1426,64 +1506,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
else
dquot_incr_space(inode->i_dquot[cnt], number);
}
- if (!reserve)
- inode_add_bytes(inode, number);
-out_unlock:
- spin_unlock(&dq_data_lock);
- flush_warnings(inode->i_dquot, warntype);
- return ret;
-}
-
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
-{
- int cnt, ret = QUOTA_OK;
-
- /*
- * First test before acquiring mutex - solves deadlocks when we
- * re-enter the quota code and are already holding the mutex
- */
- if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
- goto out;
- }
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
- goto out_unlock;
- }
-
- ret = __dquot_alloc_space(inode, number, warn, 0);
- if (ret == NO_QUOTA)
- goto out_unlock;
+ __inode_add_bytes(inode, number, reserve);

+ if (reserve)
+ goto out_data_unlock;
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+out_data_unlock:
+ spin_unlock(&dq_data_lock);
+ flush_warnings(inode->i_dquot, warntype);
out_unlock:
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
out:
return ret;
}
+
+int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+{
+ return __dquot_alloc_space(inode, number, warn, 0);
+}
EXPORT_SYMBOL(dquot_alloc_space);

int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
{
- int ret = QUOTA_OK;
-
- if (IS_NOQUOTA(inode))
- goto out;
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode))
- goto out_unlock;
-
- ret = __dquot_alloc_space(inode, number, warn, 1);
-out_unlock:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return ret;
+ return __dquot_alloc_space(inode, number, warn, 1);
}
EXPORT_SYMBOL(dquot_reserve_space);

@@ -1540,14 +1588,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
int ret = QUOTA_OK;

if (IS_NOQUOTA(inode)) {
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
goto out;
}

down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
if (IS_NOQUOTA(inode)) {
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
goto out;
}

@@ -1559,7 +1607,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
number);
}
/* Update inode bytes */
- inode_add_bytes(inode, number);
+ inode_claim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
@@ -1572,38 +1620,9 @@ out:
EXPORT_SYMBOL(dquot_claim_space);

/*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
- int cnt;
-
- if (IS_NOQUOTA(inode))
- goto out;
-
- down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
- if (IS_NOQUOTA(inode))
- goto out_unlock;
-
- spin_lock(&dq_data_lock);
- /* Release reserved dquots */
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (inode->i_dquot[cnt])
- dquot_free_reserved_space(inode->i_dquot[cnt], number);
- }
- spin_unlock(&dq_data_lock);
-
-out_unlock:
- up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
- return;
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
-
-/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_space(struct inode *inode, qsize_t number)
+int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
@@ -1612,7 +1631,7 @@ int dquot_free_space(struct inode *inode, qsize_t number)
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode)) {
out_sub:
- inode_sub_bytes(inode, number);
+ __inode_sub_bytes(inode, number, reserve);
return QUOTA_OK;
}

@@ -1627,21 +1646,43 @@ out_sub:
if (!inode->i_dquot[cnt])
continue;
warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
- dquot_decr_space(inode->i_dquot[cnt], number);
+ if (reserve)
+ dquot_free_reserved_space(inode->i_dquot[cnt], number);
+ else
+ dquot_decr_space(inode->i_dquot[cnt], number);
}
- inode_sub_bytes(inode, number);
+ __inode_sub_bytes(inode, number, reserve);
spin_unlock(&dq_data_lock);
+
+ if (reserve)
+ goto out_unlock;
/* Dirtify all the dquots - this can block when journalling */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+out_unlock:
flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
return QUOTA_OK;
}
+
+int dquot_free_space(struct inode *inode, qsize_t number)
+{
+ return __dquot_free_space(inode, number, 0);
+}
EXPORT_SYMBOL(dquot_free_space);

/*
+ * Release reserved quota space
+ */
+void dquot_release_reserved_space(struct inode *inode, qsize_t number)
+{
+ return (void )__dquot_free_space(inode, number, 1);
+
+}
+EXPORT_SYMBOL(dquot_release_reserved_space);
+
+/*
* This operation can block, but only after everything is updated
*/
int dquot_free_inode(const struct inode *inode, qsize_t number)
@@ -1679,19 +1720,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
EXPORT_SYMBOL(dquot_free_inode);

/*
- * call back function, get reserved quota space from underlying fs
- */
-qsize_t dquot_get_reserved_space(struct inode *inode)
-{
- qsize_t reserved_space = 0;
-
- if (sb_any_quota_active(inode->i_sb) &&
- inode->i_sb->dq_op->get_reserved_space)
- reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
- return reserved_space;
-}
-
-/*
* Transfer the number of inode and blocks from one diskquota to an other.
*
* This operation can block, but only after everything is updated
@@ -1734,7 +1762,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
}
spin_lock(&dq_data_lock);
cur_space = inode_get_bytes(inode);
- rsv_space = dquot_get_reserved_space(inode);
+ rsv_space = inode_get_rsv_space(inode);
space = cur_space + rsv_space;
/* Build the transfer_from list and check the limits */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1854,6 +1882,7 @@ const struct dquot_operations dquot_operations = {
.write_info = dquot_commit_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+
};

/*
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 78c4889..40845f1 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -313,8 +313,9 @@ struct dquot_operations {
int (*claim_space) (struct inode *, qsize_t);
/* release rsved quota for delayed alloc */
void (*release_rsv) (struct inode *, qsize_t);
- /* get reserved quota for delayed alloc */
- qsize_t (*get_reserved_space) (struct inode *);
+ /* get reserved quota for delayed alloc, value returned is managed by
+ * quota code only */
+ qsize_t* (*get_reserved_space) (struct inode *);
};

/* Operations handling requests from userspace */
--
1.6.0.4



2009-12-10 12:25:41

by Dmitry Monakhov

[permalink] [raw]
Subject: [PATCH 2/2] ext4: Convert to generic reserved quota's space management. [V3]

This patch fix write vs chown race condition.

Changes from V2:
-add missed i_reserved_quota iniatilization.diff

Signed-off-by: Dmitry Monakhov <[email protected]>
---
fs/ext4/ext4.h | 6 +++++-
fs/ext4/inode.c | 16 +++++++---------
fs/ext4/super.c | 3 +++
3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 26d3cf8..992cecd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -705,6 +705,10 @@ struct ext4_inode_info {
struct list_head i_aio_dio_complete_list;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+#ifdef CONFIG_QUOTA
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
};

/*
@@ -1427,7 +1431,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
+extern qsize_t* ext4_get_reserved_space(struct inode *inode);
extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f693768..7fa6db6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1048,17 +1048,12 @@ out:
return err;
}

-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t* ext4_get_reserved_space(struct inode *inode)
{
- unsigned long long total;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks +
- EXT4_I(inode)->i_reserved_meta_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- return (total << inode->i_blkbits);
+ return &EXT4_I(inode)->i_reserved_quota;
}
+#endif
/*
* Calculate the number of metadata blocks need to reserve
* to allocate @blocks for non extent file based file
@@ -5046,6 +5041,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_last_alloc_group = ~0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4c87d97..4697272 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -713,6 +713,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&(ei->i_block_reservation_lock));
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
ei->cur_aio_dio = NULL;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif

return &ei->vfs_inode;
}
--
1.6.0.4


2009-12-11 20:45:13

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 1/2] quota: decouple fs reserved space from quota reservation. [V3]

Hi,

On Thu 10-12-09 15:25:51, Dmitry Monakhov wrote:
> Currently inode_reservation is managed by fs itself and this
> reservation is transfered on dquot_transfer(). This means what
> inode_reservation must always be in sync with
> dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result
> in incorrect quota(WARN_ON in dquot_claim_reserved_space() will be
> triggered)
> This is not easy because of complex locking order issues
> for example http://bugzilla.kernel.org/show_bug.cgi?id=14739
>
> The patch introduce quota reservation field for each fs-inode
> (fs specific inode is used in order to prevent bloating generic
> vfs inode). This reservation is managed by quota code internally
> similar to i_blocks/i_bytes and may not be always in sync with
> internal fs reservation.
>
> Also perform some code rearrangement:
> - Unify dquot_reserve_space() and dquot_reserve_space()
> - Unify dquot_release_reserved_space() and dquot_free_space()
> - Also this patch add missing warning update to release_rsv()
> dquot_release_reserved_space() must call flush_warnings() as
> dquot_free_space() does.
>
> Changes from V1:
> - move qutoa_reservation field from vfs_inode to fs_inode
> - account reservation in bytes instead of blocks.
Thanks for looking into this! I have some comments to the patch below
but generally I like it.

> Signed-off-by: Dmitry Monakhov <[email protected]>
> ---
> fs/quota/dquot.c | 219 ++++++++++++++++++++++++++++---------------------
> include/linux/quota.h | 5 +-
> 2 files changed, 127 insertions(+), 97 deletions(-)
>
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 39b49c4..66221f8 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -1388,6 +1388,71 @@ void vfs_dq_drop(struct inode *inode)
> EXPORT_SYMBOL(vfs_dq_drop);
>
> /*
> + * inode_reserved_space is managed internally by quota, and protected by
> + * i_lock similar to i_blocks+i_bytes.
> + */
> +static qsize_t* inode_reserved_space(struct inode * inode)
^^ it should be "qsize_t *"
> +{
> + /* Filesystem must explicitly define it's own method in order to use
> + * quota reservation interface */
> + BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
> + return inode->i_sb->dq_op->get_reserved_space(inode);
> +}
> +
> +static void inode_add_rsv_space(struct inode *inode, qsize_t number)
> +{
> + spin_lock(&inode->i_lock);
> + *inode_reserved_space(inode) += number;
> + spin_unlock(&inode->i_lock);
> +}
> +
Hmm, I think I have a better solution for this: Each inode will have a
pointer to "inode features" table. That table will contain offsets of
general structures (e.g. things needed for quotas) embedded inside
fs-specific inode. That way we can later move similar structures out of
general VFS inode to inodes of filesystems that need it. When a filesystem
creates an inode, it fills in a proper pointer to the table... That should
allow generic code access data in fs-specific part of inode structure,
we don't have to add functions returning pointers, etc.
But for now I'd go with your solution since mine will require more
widespread changes to quota code and we need to fix that bug.

> +static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
> +{
> + spin_lock(&inode->i_lock);
> + *inode_reserved_space(inode) -= number;
> + inode->i_blocks += number >> 9;
> + number &= 511;
> + inode->i_bytes += number;
> + if (inode->i_bytes >= 512) {
> + inode->i_blocks++;
> + inode->i_bytes -= 512;
> + }
> + spin_unlock(&inode->i_lock);
> +}
Please add variant of inode_add_bytes() which does not acquire
inode->i_lock and use it here. I'd suggest __inode_add_bytes and rename
__inode_add_bytes below to something more appropriate like
inode_alloc_reserve_bytes or so.

...
> @@ -1426,64 +1506,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
> else
> dquot_incr_space(inode->i_dquot[cnt], number);
> }
> - if (!reserve)
> - inode_add_bytes(inode, number);
> -out_unlock:
> - spin_unlock(&dq_data_lock);
> - flush_warnings(inode->i_dquot, warntype);
> - return ret;
> -}
> -
> -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
> -{
> - int cnt, ret = QUOTA_OK;
> -
> - /*
> - * First test before acquiring mutex - solves deadlocks when we
> - * re-enter the quota code and are already holding the mutex
> - */
> - if (IS_NOQUOTA(inode)) {
> - inode_add_bytes(inode, number);
> - goto out;
> - }
> -
> - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
> - if (IS_NOQUOTA(inode)) {
> - inode_add_bytes(inode, number);
> - goto out_unlock;
> - }
> -
> - ret = __dquot_alloc_space(inode, number, warn, 0);
> - if (ret == NO_QUOTA)
> - goto out_unlock;
> + __inode_add_bytes(inode, number, reserve);
>
> + if (reserve)
> + goto out_data_unlock;
> /* Dirtify all the dquots - this can block when journalling */
> for (cnt = 0; cnt < MAXQUOTAS; cnt++)
> if (inode->i_dquot[cnt])
> mark_dquot_dirty(inode->i_dquot[cnt]);
> +out_data_unlock:
> + spin_unlock(&dq_data_lock);
> + flush_warnings(inode->i_dquot, warntype);
You have to unlock dq_data_lock before mark_dquot_dirty as it can sleep.

> out_unlock:
> up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
> out:
> return ret;
> }
...
> @@ -1854,6 +1882,7 @@ const struct dquot_operations dquot_operations = {
> .write_info = dquot_commit_info,
> .alloc_dquot = dquot_alloc,
> .destroy_dquot = dquot_destroy,
> +
> };
I guess the above is a mistake...

> diff --git a/include/linux/quota.h b/include/linux/quota.h
> index 78c4889..40845f1 100644
> --- a/include/linux/quota.h
> +++ b/include/linux/quota.h
> @@ -313,8 +313,9 @@ struct dquot_operations {
> int (*claim_space) (struct inode *, qsize_t);
> /* release rsved quota for delayed alloc */
> void (*release_rsv) (struct inode *, qsize_t);
> - /* get reserved quota for delayed alloc */
> - qsize_t (*get_reserved_space) (struct inode *);
> + /* get reserved quota for delayed alloc, value returned is managed by
> + * quota code only */
> + qsize_t* (*get_reserved_space) (struct inode *);
I think proper coding style is "qsize_t *(*get_reserved_space) (...)"

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2009-12-11 20:45:13

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 2/2] ext4: Convert to generic reserved quota's space management. [V3]

On Thu 10-12-09 15:25:52, Dmitry Monakhov wrote:
> This patch fix write vs chown race condition.
>
> Changes from V2:
> -add missed i_reserved_quota iniatilization.diff
>
> Signed-off-by: Dmitry Monakhov <[email protected]>
Looks OK except for the qsize_t* coding style thing...

Honza

> ---
> fs/ext4/ext4.h | 6 +++++-
> fs/ext4/inode.c | 16 +++++++---------
> fs/ext4/super.c | 3 +++
> 3 files changed, 15 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 26d3cf8..992cecd 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -705,6 +705,10 @@ struct ext4_inode_info {
> struct list_head i_aio_dio_complete_list;
> /* current io_end structure for async DIO write*/
> ext4_io_end_t *cur_aio_dio;
> +#ifdef CONFIG_QUOTA
> + /* quota space reservation, managed internally by quota code */
> + qsize_t i_reserved_quota;
> +#endif
> };
>
> /*
> @@ -1427,7 +1431,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
> extern int ext4_block_truncate_page(handle_t *handle,
> struct address_space *mapping, loff_t from);
> extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
> -extern qsize_t ext4_get_reserved_space(struct inode *inode);
> +extern qsize_t* ext4_get_reserved_space(struct inode *inode);
> extern int flush_aio_dio_completed_IO(struct inode *inode);
> /* ioctl.c */
> extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f693768..7fa6db6 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1048,17 +1048,12 @@ out:
> return err;
> }
>
> -qsize_t ext4_get_reserved_space(struct inode *inode)
> +#ifdef CONFIG_QUOTA
> +qsize_t* ext4_get_reserved_space(struct inode *inode)
> {
> - unsigned long long total;
> -
> - spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
> - total = EXT4_I(inode)->i_reserved_data_blocks +
> - EXT4_I(inode)->i_reserved_meta_blocks;
> - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
> -
> - return (total << inode->i_blkbits);
> + return &EXT4_I(inode)->i_reserved_quota;
> }
> +#endif
> /*
> * Calculate the number of metadata blocks need to reserve
> * to allocate @blocks for non extent file based file
> @@ -5046,6 +5041,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
> inode->i_size = ext4_isize(raw_inode);
> ei->i_disksize = inode->i_size;
> +#ifdef CONFIG_QUOTA
> + ei->i_reserved_quota = 0;
> +#endif
> inode->i_generation = le32_to_cpu(raw_inode->i_generation);
> ei->i_block_group = iloc.block_group;
> ei->i_last_alloc_group = ~0;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 4c87d97..4697272 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -713,6 +713,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
> spin_lock_init(&(ei->i_block_reservation_lock));
> INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
> ei->cur_aio_dio = NULL;
> +#ifdef CONFIG_QUOTA
> + ei->i_reserved_quota = 0;
> +#endif
>
> return &ei->vfs_inode;
> }
> --
> 1.6.0.4
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2009-12-14 12:45:26

by Dmitry Monakhov

[permalink] [raw]
Subject: Re: [PATCH 1/2] quota: decouple fs reserved space from quota reservation. [V3]

Hi, Please ignore all messages which I have sent yesterday(sunday). I
have had troubles with my default
smtp server.

I've submitted updated version of the patch-set. It starts from
following message.
Subject: [PATCH 1/5] Add unlocked version of inode_add_bytes() function
Date: Mon, 14 Dec 2009 15:21:12 +0300
Message-id: <[email protected]>
Last two patches in the series not directly related with the race bug,
but depends
on the previous patches. We may easily defer this patches until a proper moment.


2009/12/10 Jan Kara <[email protected]>:
>  Hi,
>
> On Thu 10-12-09 15:25:51, Dmitry Monakhov wrote:
>> Currently inode_reservation is managed by fs itself and this
>> reservation is transfered on dquot_transfer(). This means what
>> inode_reservation must always be in sync with
>> dquot->dq_dqb.dqb_rsvspace. Otherwise dquot_transfer() will result
>> in incorrect quota(WARN_ON in dquot_claim_reserved_space() will be
>> triggered)
>> This is not easy because of complex locking order issues
>> for example http://bugzilla.kernel.org/show_bug.cgi?id=14739
>>
>> The patch introduce quota reservation field for each fs-inode
>> (fs specific inode is used in order to prevent bloating generic
>> vfs inode). This reservation is managed by quota code internally
>> similar to i_blocks/i_bytes and may not be always in sync with
>> internal fs reservation.
>>
>> Also perform some code rearrangement:
>> - Unify dquot_reserve_space() and dquot_reserve_space()
>> - Unify dquot_release_reserved_space() and dquot_free_space()
>> - Also this patch add missing warning update to release_rsv()
>>   dquot_release_reserved_space() must call flush_warnings() as
>>   dquot_free_space() does.
>>
>> Changes from V1:
>> - move qutoa_reservation field from vfs_inode to fs_inode
>> - account reservation in bytes instead of blocks.
>  Thanks for looking into this! I have some comments to the patch below
> but generally I like it.
>
>> Signed-off-by: Dmitry Monakhov <[email protected]>
>> ---
>>  fs/quota/dquot.c      |  219 ++++++++++++++++++++++++++++---------------------
>>  include/linux/quota.h |    5 +-
>>  2 files changed, 127 insertions(+), 97 deletions(-)
>>
>> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
>> index 39b49c4..66221f8 100644
>> --- a/fs/quota/dquot.c
>> +++ b/fs/quota/dquot.c
>> @@ -1388,6 +1388,71 @@ void vfs_dq_drop(struct inode *inode)
>>  EXPORT_SYMBOL(vfs_dq_drop);
>>
>>  /*
>> + * inode_reserved_space is managed internally by quota, and protected by
>> + * i_lock similar to i_blocks+i_bytes.
>> + */
>> +static qsize_t* inode_reserved_space(struct inode * inode)
>                 ^^ it should be "qsize_t *"
>> +{
>> +     /* Filesystem must explicitly define it's own method in order to use
>> +      * quota reservation interface */
>> +     BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
>> +     return inode->i_sb->dq_op->get_reserved_space(inode);
>> +}
>> +
>> +static void inode_add_rsv_space(struct inode *inode, qsize_t number)
>> +{
>> +     spin_lock(&inode->i_lock);
>> +     *inode_reserved_space(inode) += number;
>> +     spin_unlock(&inode->i_lock);
>> +}
>> +
>  Hmm, I think I have a better solution for this: Each inode will have a
> pointer to "inode features" table. That table will contain offsets of
> general structures (e.g. things needed for quotas) embedded inside
> fs-specific inode. That way we can later move similar structures out of
> general VFS inode to inodes of filesystems that need it. When a filesystem
> creates an inode, it fills in a proper pointer to the table... That should
> allow generic code access data in fs-specific part of inode structure,
> we don't have to add functions returning pointers, etc.
>  But for now I'd go with your solution since mine will require more
> widespread changes to quota code and we need to fix that bug.
>
>> +static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
>> +{
>> +     spin_lock(&inode->i_lock);
>> +     *inode_reserved_space(inode) -= number;
>> +     inode->i_blocks += number >> 9;
>> +     number &= 511;
>> +     inode->i_bytes += number;
>> +     if (inode->i_bytes >= 512) {
>> +             inode->i_blocks++;
>> +             inode->i_bytes -= 512;
>> +     }
>> +     spin_unlock(&inode->i_lock);
>> +}
>  Please add variant of inode_add_bytes() which does not acquire
> inode->i_lock and use it here. I'd suggest __inode_add_bytes and rename
> __inode_add_bytes below to something more appropriate like
> inode_alloc_reserve_bytes or so.
>
> ...
>> @@ -1426,64 +1506,32 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
>>               else
>>                       dquot_incr_space(inode->i_dquot[cnt], number);
>>       }
>> -     if (!reserve)
>> -             inode_add_bytes(inode, number);
>> -out_unlock:
>> -     spin_unlock(&dq_data_lock);
>> -     flush_warnings(inode->i_dquot, warntype);
>> -     return ret;
>> -}
>> -
>> -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
>> -{
>> -     int cnt, ret = QUOTA_OK;
>> -
>> -     /*
>> -      * First test before acquiring mutex - solves deadlocks when we
>> -      * re-enter the quota code and are already holding the mutex
>> -      */
>> -     if (IS_NOQUOTA(inode)) {
>> -             inode_add_bytes(inode, number);
>> -             goto out;
>> -     }
>> -
>> -     down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
>> -     if (IS_NOQUOTA(inode)) {
>> -             inode_add_bytes(inode, number);
>> -             goto out_unlock;
>> -     }
>> -
>> -     ret = __dquot_alloc_space(inode, number, warn, 0);
>> -     if (ret == NO_QUOTA)
>> -             goto out_unlock;
>> +     __inode_add_bytes(inode, number, reserve);
>>
>> +     if (reserve)
>> +             goto out_data_unlock;
>>       /* Dirtify all the dquots - this can block when journalling */
>>       for (cnt = 0; cnt < MAXQUOTAS; cnt++)
>>               if (inode->i_dquot[cnt])
>>                       mark_dquot_dirty(inode->i_dquot[cnt]);
>> +out_data_unlock:
>> +     spin_unlock(&dq_data_lock);
>> +     flush_warnings(inode->i_dquot, warntype);
>  You have to unlock dq_data_lock before mark_dquot_dirty as it can sleep.
>
>>  out_unlock:
>>       up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
>>  out:
>>       return ret;
>>  }
> ...
>> @@ -1854,6 +1882,7 @@ const struct dquot_operations dquot_operations = {
>>       .write_info     = dquot_commit_info,
>>       .alloc_dquot    = dquot_alloc,
>>       .destroy_dquot  = dquot_destroy,
>> +
>>  };
>  I guess the above is a mistake...
>
>> diff --git a/include/linux/quota.h b/include/linux/quota.h
>> index 78c4889..40845f1 100644
>> --- a/include/linux/quota.h
>> +++ b/include/linux/quota.h
>> @@ -313,8 +313,9 @@ struct dquot_operations {
>>       int (*claim_space) (struct inode *, qsize_t);
>>       /* release rsved quota for delayed alloc */
>>       void (*release_rsv) (struct inode *, qsize_t);
>> -     /* get reserved quota for delayed alloc */
>> -     qsize_t (*get_reserved_space) (struct inode *);
>> +     /* get reserved quota for delayed alloc, value returned is managed by
>> +      * quota code only */
>> +     qsize_t* (*get_reserved_space) (struct inode *);
>  I think proper coding style is "qsize_t *(*get_reserved_space) (...)"
>
>                                                                        Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
>