a) Incase journal transaction allocation
fails due to ENOMEM don't call ext4_std_error() since it will remount
the fs as readonly and logs the message in kernel log.
b) Call posix_acl_release() incase journal allocation fails in case
of error paths.
Signed-off-by: Manish Katiyar <[email protected]>
Acked-by: Jan Kara <[email protected]>
---
fs/ext4/acl.c | 9 ++++++---
1 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 21eacd7..93dc9a6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -354,7 +354,8 @@ ext4_acl_chmod(struct inode *inode)
EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
- ext4_std_error(inode->i_sb, error);
+ if (error != -ENOMEM)
+ ext4_std_error(inode->i_sb, error);
goto out;
}
error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
@@ -450,8 +451,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
retry:
handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto release_and_out;
+ }
error = ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
--
1.7.4.1
changes from v1 -> v2 :
*) Update start_this_handle to take extra parameter to specify whether
to retry the allocation or not.
*) Added jbd allocation flags for callers to control the transaction allocation
behavior. Callers can pass JBD2_TOPLEVEL if allocation needs to be done using GFP_KERNEL.
Pass extra flags in journal routines to specify if its ok to
fail in the journal transaction allocation. Passing JBD2_FAIL_OK means caller is
ok with journal start failures and can handle ENOMEM.
Update ocfs2 and ext4 routines to pass JBD2_NO_FAIL for the updated journal
interface by default, to retain the existing behavior.
Signed-off-by: Manish Katiyar <[email protected]>
---
fs/ext4/ext4_jbd2.h | 2 +-
fs/ext4/super.c | 2 +-
fs/jbd2/transaction.c | 44 ++++++++++++++++----------------------------
fs/ocfs2/journal.c | 8 ++++----
include/linux/jbd2.h | 13 +++++++++----
5 files changed, 31 insertions(+), 38 deletions(-)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index bb85757..14e6599 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -220,7 +220,7 @@ static inline int ext4_journal_extend(handle_t *handle, int nblocks)
static inline int ext4_journal_restart(handle_t *handle, int nblocks)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2_journal_restart(handle, nblocks, JBD2_NO_FAIL);
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d9937df..aa842f3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -295,7 +295,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
ext4_abort(sb, "Detected aborted journal");
return ERR_PTR(-EROFS);
}
- return jbd2_journal_start(journal, nblocks);
+ return jbd2_journal_start(journal, nblocks, JBD2_NO_FAIL);
}
/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3eec82d..7f53589 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -106,6 +106,11 @@ static inline void update_t_max_wait(transaction_t *transaction,
#endif
}
+static inline int __jbd_flags_to_mask(int jbd_alloc_flags)
+{
+ return (jbd_alloc_flags & JBD2_TOPLEVEL) ? GFP_KERNEL : GFP_NOFS;
+}
+
/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
@@ -114,13 +119,14 @@ static inline void update_t_max_wait(transaction_t *transaction,
*/
static int start_this_handle(journal_t *journal, handle_t *handle,
- int gfp_mask)
+ int jbd_alloc_flags)
{
transaction_t *transaction, *new_transaction = NULL;
tid_t tid;
int needed, need_to_start;
int nblocks = handle->h_buffer_credits;
unsigned long ts = jiffies;
+ int gfp_mask = __jbd_flags_to_mask(jbd_alloc_flags);
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -133,14 +139,7 @@ alloc_transaction:
if (!journal->j_running_transaction) {
new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
if (!new_transaction) {
- /*
- * If __GFP_FS is not present, then we may be
- * being called from inside the fs writeback
- * layer, so we MUST NOT fail. Since
- * __GFP_NOFAIL is going away, we will arrange
- * to retry the allocation ourselves.
- */
- if ((gfp_mask & __GFP_FS) == 0) {
+ if (jbd_alloc_flags & JBD2_NO_FAIL) {
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto alloc_transaction;
}
@@ -308,6 +307,7 @@ static handle_t *new_handle(int nblocks)
* handle_t *jbd2_journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffer we might modify
+ * @jbd_alloc_flags: jbd allocation flags for transaction.
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
@@ -319,7 +319,8 @@ static handle_t *new_handle(int nblocks)
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
+handle_t *jbd2_journal_start(journal_t *journal,
+ int nblocks, int jbd_alloc_flags)
{
handle_t *handle = journal_current_handle();
int err;
@@ -339,7 +340,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
current->journal_info = handle;
- err = start_this_handle(journal, handle, gfp_mask);
+ err = start_this_handle(journal, handle, jbd_alloc_flags);
if (err < 0) {
jbd2_free_handle(handle);
current->journal_info = NULL;
@@ -347,13 +348,6 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
}
return handle;
}
-EXPORT_SYMBOL(jbd2__journal_start);
-
-
-handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
-{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS);
-}
EXPORT_SYMBOL(jbd2_journal_start);
@@ -442,7 +436,8 @@ out:
* transaction capabable of guaranteeing the requested number of
* credits.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
+int jbd2_journal_restart(handle_t *handle,
+ int nblocks, int jbd_alloc_flags)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
@@ -478,16 +473,9 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
- ret = start_this_handle(journal, handle, gfp_mask);
+ ret = start_this_handle(journal, handle, jbd_alloc_flags);
return ret;
}
-EXPORT_SYMBOL(jbd2__journal_restart);
-
-
-int jbd2_journal_restart(handle_t *handle, int nblocks)
-{
- return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
-}
EXPORT_SYMBOL(jbd2_journal_restart);
/**
@@ -1437,7 +1425,7 @@ int jbd2_journal_force_commit(journal_t *journal)
handle_t *handle;
int ret;
- handle = jbd2_journal_start(journal, 1);
+ handle = jbd2_journal_start(journal, 1, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
} else {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index b141a44..a784a64 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -353,11 +353,11 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
/* Nested transaction? Just return the handle... */
if (journal_current_handle())
- return jbd2_journal_start(journal, max_buffs);
+ return jbd2_journal_start(journal, max_buffs, JBD2_NO_FAIL);
down_read(&osb->journal->j_trans_barrier);
- handle = jbd2_journal_start(journal, max_buffs);
+ handle = jbd2_journal_start(journal, max_buffs, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
@@ -437,8 +437,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
if (status > 0) {
trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
- status = jbd2_journal_restart(handle,
- old_nblocks + nblocks);
+ status = jbd2_journal_restart(handle, old_nblocks + nblocks,
+ JBD2_NO_FAIL);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4ecb7b1..c11181b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1106,10 +1106,10 @@ static inline handle_t *journal_current_handle(void)
* Register buffer modifications against the current transaction.
*/
-extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
-extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask);
-extern int jbd2_journal_restart(handle_t *, int nblocks);
-extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask);
+extern handle_t *jbd2_journal_start(journal_t *,
+ int nblocks, int jbd_alloc_flags);
+extern int jbd2_journal_restart(handle_t *,
+ int nblocks, int jbd_alloc_flags);
extern int jbd2_journal_extend (handle_t *, int nblocks);
extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
@@ -1322,6 +1322,11 @@ static inline int jbd_space_needed(journal_t *journal)
extern int jbd_blocks_per_page(struct inode *inode);
+/* JBD2 transaction allocation flags */
+#define JBD2_NO_FAIL 0x00000001
+#define JBD2_FAIL_OK 0x00000002
+#define JBD2_TOPLEVEL 0x00000004
+
#ifdef __KERNEL__
#define buffer_trace_init(bh) do {} while (0)
--
1.7.4.1
This patch updates ext4 code to pass appropriate transaction
allocation flags to callers who are safe to deal with ENOMEM.
Pass JBD2_TOPLEVEL from the callers who are safe to do allocations
with GFP_KERNEL instead of GFP_NOFS.
Signed-off-by: Manish Katiyar <[email protected]>
---
fs/ext4/acl.c | 6 ++++--
fs/ext4/ext4_jbd2.h | 8 +++++---
fs/ext4/extents.c | 11 ++++++-----
fs/ext4/ialloc.c | 2 +-
fs/ext4/inode.c | 40 ++++++++++++++++++++++++----------------
fs/ext4/ioctl.c | 6 ++++--
fs/ext4/migrate.c | 10 +++++-----
fs/ext4/move_extent.c | 3 ++-
fs/ext4/namei.c | 47 ++++++++++++++++++++++++++++++-----------------
fs/ext4/resize.c | 8 ++++----
fs/ext4/super.c | 18 +++++++++++-------
fs/ext4/xattr.c | 3 ++-
12 files changed, 98 insertions(+), 64 deletions(-)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 93dc9a6..95db569 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -351,7 +351,8 @@ ext4_acl_chmod(struct inode *inode)
retry:
handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
if (error != -ENOMEM)
@@ -450,7 +451,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
acl = NULL;
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto release_and_out;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 14e6599..55084c2 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -156,7 +156,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
#define ext4_handle_dirty_super(handle, sb) \
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+handle_t *ext4_journal_start_sb(struct super_block *sb,
+ int nblocks, int jbd_alloc_flags);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -197,9 +198,10 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
return 1;
}
-static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+static inline handle_t *ext4_journal_start(struct inode *inode,
+ int nblocks, int jbd_alloc_flags)
{
- return ext4_journal_start_sb(inode->i_sb, nblocks);
+ return ext4_journal_start_sb(inode->i_sb, nblocks, jbd_alloc_flags);
}
#define ext4_journal_stop(handle) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5199bac..b610294 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2546,7 +2546,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u\n", start);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, depth + 1);
+ handle = ext4_journal_start(inode, depth + 1, JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -3672,7 +3672,7 @@ void ext4_ext_truncate(struct inode *inode)
* probably first extent we're gonna free will be last in block
*/
err = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, err);
+ handle = ext4_journal_start(inode, err, JBD2_NO_FAIL);
if (IS_ERR(handle))
return;
@@ -3811,7 +3811,8 @@ retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, credits,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
@@ -3889,7 +3890,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
while (ret >= 0 && ret < max_blocks) {
map.m_lblk += ret;
map.m_len = (max_blocks -= ret);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, credits, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
@@ -4215,7 +4216,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
ext4_flush_completed_IO(inode);
credits = ext4_writepage_trans_blocks(inode);
- handle = ext4_journal_start(inode, credits);
+ handle = ext4_journal_start(inode, credits, JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f6..09b73d7 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1263,7 +1263,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
goto out;
- handle = ext4_journal_start_sb(sb, 1);
+ handle = ext4_journal_start_sb(sb, 1, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c..0355858 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -129,7 +129,8 @@ static handle_t *start_transaction(struct inode *inode)
{
handle_t *result;
- result = ext4_journal_start(inode, blocks_for_truncate(inode));
+ result = ext4_journal_start(inode,
+ blocks_for_truncate(inode), JBD2_NO_FAIL);
if (!IS_ERR(result))
return result;
@@ -204,7 +205,8 @@ void ext4_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
+ handle = ext4_journal_start(inode,
+ blocks_for_truncate(inode)+3, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -1401,7 +1403,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, dio_credits);
+ handle = ext4_journal_start(inode, dio_credits, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
return ret;
@@ -1624,7 +1626,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, needed_blocks, JBD2_FAIL_OK);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -2558,7 +2560,8 @@ static int __ext4_journalled_writepage(struct page *page,
* references to buffers so we are safe */
unlock_page(page);
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode),
+ JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -2988,7 +2991,7 @@ retry:
needed_blocks = ext4_da_writepages_trans_blocks(inode);
/* start a new transaction*/
- handle = ext4_journal_start(inode, needed_blocks);
+ handle = ext4_journal_start(inode, needed_blocks, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
@@ -3124,7 +3127,7 @@ retry:
* to journalling the i_disksize update if writes to the end
* of file which has an already mapped buffer.
*/
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1, JBD2_FAIL_OK);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -3478,7 +3481,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
if (final_size > inode->i_size) {
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, 2, JBD2_FAIL_OK);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -3521,7 +3524,7 @@ retry:
int err;
/* Credits for sb + inode write */
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, 2, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
* but cannot extend i_size. Bail out and pretend
@@ -5329,8 +5332,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode,
+ (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5364,7 +5369,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
(attr->ia_size < inode->i_size)) {
handle_t *handle;
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode, 3,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5384,7 +5390,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size);
if (error) {
/* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode, 3);
+ handle = ext4_journal_start(inode, 3,
+ JBD2_NO_FAIL);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
goto err_out;
@@ -5421,7 +5428,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
rc = ext4_acl_chmod(inode);
err_out:
- ext4_std_error(inode->i_sb, error);
+ if (error != -ENOMEM)
+ ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
return error;
@@ -5737,7 +5745,7 @@ void ext4_dirty_inode(struct inode *inode)
{
handle_t *handle;
- handle = ext4_journal_start(inode, 2);
+ handle = ext4_journal_start(inode, 2, JBD2_NO_FAIL);
if (IS_ERR(handle))
goto out;
@@ -5821,7 +5829,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
/* Finally we can mark the inode as dirty. */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1, JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 808c554..b88e295 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -101,7 +101,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
} else if (oldflags & EXT4_EOFBLOCKS_FL)
ext4_truncate(inode);
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto flags_out;
@@ -157,7 +158,8 @@ flags_out:
goto setversion_out;
}
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto setversion_out;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b57b98f..f233f12 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -485,10 +485,10 @@ int ext4_ext_migrate(struct inode *inode)
return retval;
handle = ext4_journal_start(inode,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
- + 1);
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ + 1, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
return retval;
@@ -533,7 +533,7 @@ int ext4_ext_migrate(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
/*
* It is impossible to update on-disk structures without
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2b8304b..ff4289f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -813,7 +813,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* inode and donor_inode may change each different metadata blocks.
*/
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
- handle = ext4_journal_start(orig_inode, jblocks);
+ handle = ext4_journal_start(orig_inode, jblocks,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
return 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b754b77..101d5d5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1750,9 +1750,11 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1786,9 +1788,11 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1825,9 +1829,11 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2140,7 +2146,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2202,7 +2209,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
dquot_initialize(dir);
dquot_initialize(dentry->d_inode);
- handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2279,7 +2287,8 @@ static int ext4_symlink(struct inode *dir,
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
}
retry:
- handle = ext4_journal_start(dir, credits);
+ handle = ext4_journal_start(dir,
+ credits, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2319,7 +2328,8 @@ retry:
*/
handle = ext4_journal_start(dir,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto err_drop_inode;
@@ -2364,8 +2374,10 @@ static int ext4_link(struct dentry *old_dentry,
dquot_initialize(dir);
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2416,8 +2428,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
handle = ext4_journal_start(old_dir, 2 *
- EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2,
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 80bbc9c..6bc36ec 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -176,7 +176,7 @@ static int setup_new_group_blocks(struct super_block *sb,
int err = 0, err2;
/* This transaction may be extended/restarted along the way */
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA, JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -655,7 +655,7 @@ static void update_backups(struct super_block *sb,
handle_t *handle;
int err = 0, err2;
- handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
group = 1;
err = PTR_ERR(handle);
@@ -793,7 +793,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
*/
handle = ext4_journal_start_sb(sb,
ext4_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
+ 3 + reserved_gdb : 4, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto exit_put;
@@ -1031,7 +1031,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
/* We will update the superblock, one block bitmap, and
* one group descriptor via ext4_free_blocks().
*/
- handle = ext4_journal_start_sb(sb, 3);
+ handle = ext4_journal_start_sb(sb, 3, JBD2_NO_FAIL);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_warning(sb, "error %d on journal start", err);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index aa842f3..8beb687 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -263,7 +263,8 @@ static void ext4_put_nojournal(handle_t *handle)
* ext4 prevents a new handle from being started by s_frozen, which
* is in an upper layer.
*/
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
+handle_t *ext4_journal_start_sb(struct super_block *sb,
+ int nblocks, int jbd_alloc_flags)
{
journal_t *journal;
handle_t *handle;
@@ -295,7 +296,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
ext4_abort(sb, "Detected aborted journal");
return ERR_PTR(-EROFS);
}
- return jbd2_journal_start(journal, nblocks, JBD2_NO_FAIL);
+ return jbd2_journal_start(journal, nblocks, jbd_alloc_flags);
}
/*
@@ -4561,7 +4562,8 @@ static int ext4_write_dquot(struct dquot *dquot)
inode = dquot_to_inode(dquot);
handle = ext4_journal_start(inode,
- EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb),
+ JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
@@ -4577,7 +4579,8 @@ static int ext4_acquire_dquot(struct dquot *dquot)
handle_t *handle;
handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb),
+ JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
@@ -4593,7 +4596,8 @@ static int ext4_release_dquot(struct dquot *dquot)
handle_t *handle;
handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb),
+ JBD2_NO_FAIL);
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
@@ -4624,7 +4628,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ handle = ext4_journal_start(sb->s_root->d_inode, 2, JBD2_NO_FAIL);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -4702,7 +4706,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
/* Update modification times of quota files when userspace can
* start looking at them */
- handle = ext4_journal_start(inode, 1);
+ handle = ext4_journal_start(inode, 1, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle))
goto out;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c757adc..e27b289 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1084,7 +1084,8 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
int error, retries = 0;
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
+ (JBD2_FAIL_OK | JBD2_TOPLEVEL));
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
--
1.7.4.1
On Fri 27-05-11 23:20:57, Manish Katiyar wrote:
> changes from v1 -> v2 :
> *) Update start_this_handle to take extra parameter to specify whether
> to retry the allocation or not.
> *) Added jbd allocation flags for callers to control the transaction allocation
> behavior. Callers can pass JBD2_TOPLEVEL if allocation needs to be done using GFP_KERNEL.
The above changelog should be below where (*) is.
Also - this is mainly for Ted: I've looked at where we JBD2_TOPLEVEL could
actually be enabled and the results are: Pretty much nowhere.
The problem is that with ext4, we need i_mutex in io completion path to
end page writeback. So we cannot do GFP_KERNEL allocation whenever we hold
i_mutex because mm might wait in direct reclaim for IO to complete and that
cannot happen until we release i_mutex. And pretty much every write path in
ext4 holds i_mutex.
So JBD2_TOPLEVEL looks like a useless excercise to me and I'd just don't do
it.
> Pass extra flags in journal routines to specify if its ok to
> fail in the journal transaction allocation. Passing JBD2_FAIL_OK means caller is
> ok with journal start failures and can handle ENOMEM.
>
> Update ocfs2 and ext4 routines to pass JBD2_NO_FAIL for the updated journal
> interface by default, to retain the existing behavior.
>
> Signed-off-by: Manish Katiyar <[email protected]>
> ---
> fs/ext4/ext4_jbd2.h | 2 +-
> fs/ext4/super.c | 2 +-
> fs/jbd2/transaction.c | 44 ++++++++++++++++----------------------------
> fs/ocfs2/journal.c | 8 ++++----
> include/linux/jbd2.h | 13 +++++++++----
> 5 files changed, 31 insertions(+), 38 deletions(-)
(*) HERE
> +/* JBD2 transaction allocation flags */
> +#define JBD2_NO_FAIL 0x00000001
> +#define JBD2_FAIL_OK 0x00000002
> +#define JBD2_TOPLEVEL 0x00000004
> +
I guess there's no need for JBD2_FAIL_OK - if NOFAIL is not set, we can
fail. Otherwise the patch looks OK.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Fri 27-05-11 23:21:54, Manish Katiyar wrote:
> This patch updates ext4 code to pass appropriate transaction
> allocation flags to callers who are safe to deal with ENOMEM.
> Pass JBD2_TOPLEVEL from the callers who are safe to do allocations
> with GFP_KERNEL instead of GFP_NOFS.
I've indicated places where TOPLEVEL cannot be below. There are about 4
or 5 which are probably OK. Otherwise the patch is OK.
> Signed-off-by: Manish Katiyar <[email protected]>
> ---
> fs/ext4/acl.c | 6 ++++--
> fs/ext4/ext4_jbd2.h | 8 +++++---
> fs/ext4/extents.c | 11 ++++++-----
> fs/ext4/ialloc.c | 2 +-
> fs/ext4/inode.c | 40 ++++++++++++++++++++++++----------------
> fs/ext4/ioctl.c | 6 ++++--
> fs/ext4/migrate.c | 10 +++++-----
> fs/ext4/move_extent.c | 3 ++-
> fs/ext4/namei.c | 47 ++++++++++++++++++++++++++++++-----------------
> fs/ext4/resize.c | 8 ++++----
> fs/ext4/super.c | 18 +++++++++++-------
> fs/ext4/xattr.c | 3 ++-
> 12 files changed, 98 insertions(+), 64 deletions(-)
>
> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
> index 93dc9a6..95db569 100644
> --- a/fs/ext4/acl.c
> +++ b/fs/ext4/acl.c
> @@ -351,7 +351,8 @@ ext4_acl_chmod(struct inode *inode)
>
> retry:
> handle = ext4_journal_start(inode,
> - EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
As I wrote in my previous email, JBD2_TOPLEVEL is not OK here because we
hold i_mutex.
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> if (error != -ENOMEM)
> @@ -450,7 +451,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
> acl = NULL;
>
> retry:
> - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
> + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
The same here.
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> goto release_and_out;
...
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 5199bac..b610294 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3811,7 +3811,8 @@ retry:
> while (ret >= 0 && ret < max_blocks) {
> map.m_lblk = map.m_lblk + ret;
> map.m_len = max_blocks = max_blocks - ret;
> - handle = ext4_journal_start(inode, credits);
> + handle = ext4_journal_start(inode, credits,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
And here as well.
> if (IS_ERR(handle)) {
> ret = PTR_ERR(handle);
> break;
...
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 50d0e9c..0355858 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5329,8 +5332,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
>
> /* (user+group)*(old+new) structure, inode write (sb,
> * inode block, ? - but truncate inode update has it) */
> - handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
> - EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
> + handle = ext4_journal_start(inode,
> + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
> + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> goto err_out;
Again, TOPLEVEL cannot be here.
> @@ -5364,7 +5369,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
> (attr->ia_size < inode->i_size)) {
> handle_t *handle;
>
> - handle = ext4_journal_start(inode, 3);
> + handle = ext4_journal_start(inode, 3,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> goto err_out;
And here as well.
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index 808c554..b88e295 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -101,7 +101,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> } else if (oldflags & EXT4_EOFBLOCKS_FL)
> ext4_truncate(inode);
>
> - handle = ext4_journal_start(inode, 1);
> + handle = ext4_journal_start(inode, 1,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
And here as well.
> if (IS_ERR(handle)) {
> err = PTR_ERR(handle);
> goto flags_out;
> @@ -157,7 +158,8 @@ flags_out:
> goto setversion_out;
> }
>
> - handle = ext4_journal_start(inode, 1);
> + handle = ext4_journal_start(inode, 1,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
Here, actually TOPLEVEL is OK.
> if (IS_ERR(handle)) {
> err = PTR_ERR(handle);
> goto setversion_out;
> diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
> index b57b98f..f233f12 100644
> --- a/fs/ext4/migrate.c
> +++ b/fs/ext4/migrate.c
> @@ -485,10 +485,10 @@ int ext4_ext_migrate(struct inode *inode)
> return retval;
>
> handle = ext4_journal_start(inode,
> - EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
> - + 1);
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
> + + 1, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here.
> if (IS_ERR(handle)) {
> retval = PTR_ERR(handle);
> return retval;
...
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index 2b8304b..ff4289f 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -813,7 +813,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
> * inode and donor_inode may change each different metadata blocks.
> */
> jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
> - handle = ext4_journal_start(orig_inode, jblocks);
> + handle = ext4_journal_start(orig_inode, jblocks,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here.
> if (IS_ERR(handle)) {
> *err = PTR_ERR(handle);
> return 0;
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index b754b77..101d5d5 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1750,9 +1750,11 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
> dquot_initialize(dir);
>
> retry:
> - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> + handle = ext4_journal_start(dir,
> + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
OK, TOPLEVEL possibly could be here because we hold only directory mutex
so writeback won't be stalled. But it's a bit subtle. The same holds for
mknod, mkdir and rmdir below.
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -1786,9 +1788,11 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
> dquot_initialize(dir);
>
> retry:
> - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> + handle = ext4_journal_start(dir,
> + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -1825,9 +1829,11 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
> dquot_initialize(dir);
>
> retry:
> - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
> + handle = ext4_journal_start(dir,
> + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
> + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2140,7 +2146,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
> dquot_initialize(dir);
> dquot_initialize(dentry->d_inode);
>
> - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
> + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2202,7 +2209,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
> dquot_initialize(dir);
> dquot_initialize(dentry->d_inode);
>
> - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
> + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here.
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2279,7 +2287,8 @@ static int ext4_symlink(struct inode *dir,
> EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
> }
> retry:
> - handle = ext4_journal_start(dir, credits);
> + handle = ext4_journal_start(dir,
> + credits, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2319,7 +2328,8 @@ retry:
> */
> handle = ext4_journal_start(dir,
> EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
Above two cases should be OK.
> if (IS_ERR(handle)) {
> err = PTR_ERR(handle);
> goto err_drop_inode;
> @@ -2364,8 +2374,10 @@ static int ext4_link(struct dentry *old_dentry,
> dquot_initialize(dir);
>
> retry:
> - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS);
> + handle = ext4_journal_start(dir,
> + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here.
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
> @@ -2416,8 +2428,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
> if (new_dentry->d_inode)
> dquot_initialize(new_dentry->d_inode);
> handle = ext4_journal_start(old_dir, 2 *
> - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
> - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
> + EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
> + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2,
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
And it cannot be here either.
> if (IS_ERR(handle))
> return PTR_ERR(handle);
>
...
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index aa842f3..8beb687 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
...
> @@ -4702,7 +4706,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
>
> /* Update modification times of quota files when userspace can
> * start looking at them */
> - handle = ext4_journal_start(inode, 1);
> + handle = ext4_journal_start(inode, 1, (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here because we hold s_umount on superblock.
> if (IS_ERR(handle))
> goto out;
> inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index c757adc..e27b289 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1084,7 +1084,8 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
> int error, retries = 0;
>
> retry:
> - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
> + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
> + (JBD2_FAIL_OK | JBD2_TOPLEVEL));
TOPLEVEL cannot be here.
> if (IS_ERR(handle)) {
> error = PTR_ERR(handle);
> } else {
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Tue, May 31, 2011 at 01:22:53PM +0200, Jan Kara wrote:
>
> The problem is that with ext4, we need i_mutex in io completion path to
> end page writeback. So we cannot do GFP_KERNEL allocation whenever we hold
> i_mutex because mm might wait in direct reclaim for IO to complete and that
> cannot happen until we release i_mutex.
OK, maybe I'm being dense, but I'm not seeing it. I see where we need
i_mutex on the ext4_da_writepages() codepath, but that's never used
for direct reclaim. Direct reclaim only calls ext4_writepage(), and
that doesn't seem to try to grab i_mutex as near as I can tell. Am I
missing something?
- Ted
On Tue 31-05-11 18:27:20, Ted Tso wrote:
> On Tue, May 31, 2011 at 01:22:53PM +0200, Jan Kara wrote:
> >
> > The problem is that with ext4, we need i_mutex in io completion path to
> > end page writeback. So we cannot do GFP_KERNEL allocation whenever we hold
> > i_mutex because mm might wait in direct reclaim for IO to complete and that
> > cannot happen until we release i_mutex.
>
> OK, maybe I'm being dense, but I'm not seeing it. I see where we need
> i_mutex on the ext4_da_writepages() codepath, but that's never used
> for direct reclaim. Direct reclaim only calls ext4_writepage(), and
> that doesn't seem to try to grab i_mutex as near as I can tell. Am I
> missing something?
What happens is that direct reclaim sometimes does
wait_on_page_writeback() (e.g. shrink_page_list()) or it explicitely waits
for NR_WRITEBACK statistics to go below some threshold
(throttle_vm_writeout()). And that is deadlockable if we hold i_mutex while
doing this because we may need i_mutex to actually move the page from
PageWriteback state...
As I'm saying this, I've realized ext4 has this problem also with
stable-pages patches because there we can wait for PageWriteback in
grab_cache_page_write_begin() when we also hold i_mutex. So I think we'll
have to come up with a way to convert unwritten extents without having to
hold i_mutex. That's going to be interesting.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu, Jun 2, 2011 at 2:54 AM, Jan Kara <[email protected]> wrote:
> On Tue 31-05-11 18:27:20, Ted Tso wrote:
>> On Tue, May 31, 2011 at 01:22:53PM +0200, Jan Kara wrote:
>> >
>> > The problem is that with ext4, we need i_mutex in io completion path to
>> > end page writeback. So we cannot do GFP_KERNEL allocation whenever we hold
>> > i_mutex because mm might wait in direct reclaim for IO to complete and that
>> > cannot happen until we release i_mutex.
>>
>> OK, maybe I'm being dense, but I'm not seeing it. ?I see where we need
>> i_mutex on the ext4_da_writepages() codepath, but that's never used
>> for direct reclaim. ?Direct reclaim only calls ext4_writepage(), and
>> that doesn't seem to try to grab i_mutex as near as I can tell. ?Am I
>> missing something?
> ?What happens is that direct reclaim sometimes does
> wait_on_page_writeback() (e.g. shrink_page_list()) or it explicitely waits
> for NR_WRITEBACK statistics to go below some threshold
> (throttle_vm_writeout()). And that is deadlockable if we hold i_mutex while
> doing this because we may need i_mutex to actually move the page from
> PageWriteback state...
>
> As I'm saying this, I've realized ext4 has this problem also with
> stable-pages patches because there we can wait for PageWriteback in
> grab_cache_page_write_begin() when we also hold i_mutex. So I think we'll
> have to come up with a way to convert unwritten extents without having to
> hold i_mutex. That's going to be interesting.
Hi Jan/Ted,
Does that mean I should remove the whole JBD2_TOPLEVEL thing from my
revised patch ? Or should I fix it as per your feedback in
the other patch ?
--
Thanks -
Manish
On Thu, Jun 02, 2011 at 11:54:24AM +0200, Jan Kara wrote:
> What happens is that direct reclaim sometimes does
> wait_on_page_writeback() (e.g. shrink_page_list()) or it explicitely waits
> for NR_WRITEBACK statistics to go below some threshold
> (throttle_vm_writeout()). And that is deadlockable if we hold i_mutex while
> doing this because we may need i_mutex to actually move the page from
> PageWriteback state...
We don't actully call set_page_writeback() until right before we
submit the page for writeback. And we convert the unwritten extents
in a workqueue, which gets submitted after we call
end_page_writeback(). So I'm still not seeing a problem; sorry if I'm
being dense!
- Ted
> As I'm saying this, I've realized ext4 has this problem also with
> stable-pages patches because there we can wait for PageWriteback in
> grab_cache_page_write_begin() when we also hold i_mutex. So I think we'll
> have to come up with a way to convert unwritten extents without having to
> hold i_mutex. That's going to be interesting.
>
> Honza
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Ping?
- Ted
On Sun, Jun 05, 2011 at 11:21:13PM -0400, Ted Ts'o wrote:
> On Thu, Jun 02, 2011 at 11:54:24AM +0200, Jan Kara wrote:
> > What happens is that direct reclaim sometimes does
> > wait_on_page_writeback() (e.g. shrink_page_list()) or it explicitely waits
> > for NR_WRITEBACK statistics to go below some threshold
> > (throttle_vm_writeout()). And that is deadlockable if we hold i_mutex while
> > doing this because we may need i_mutex to actually move the page from
> > PageWriteback state...
>
> We don't actully call set_page_writeback() until right before we
> submit the page for writeback. And we convert the unwritten extents
> in a workqueue, which gets submitted after we call
> end_page_writeback(). So I'm still not seeing a problem; sorry if I'm
> being dense!
>
> - Ted
>
> > As I'm saying this, I've realized ext4 has this problem also with
> > stable-pages patches because there we can wait for PageWriteback in
> > grab_cache_page_write_begin() when we also hold i_mutex. So I think we'll
> > have to come up with a way to convert unwritten extents without having to
> > hold i_mutex. That's going to be interesting.
> >
> > Honza
> > --
> > Jan Kara <[email protected]>
> > SUSE Labs, CR
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 8, 2011 at 7:10 AM, Ted Ts'o <[email protected]> wrote:
> Ping?
Hi Jan,
Is there anything required from my side on this patch ? From your
other post it seems that holding i_mutex isn't a problem. Please
advise.
--
Thanks -
Manish
Hi,
On Thu 16-06-11 23:32:12, Manish Katiyar wrote:
> On Wed, Jun 8, 2011 at 7:10 AM, Ted Ts'o <[email protected]> wrote:
> > Ping?
>
> Is there anything required from my side on this patch ? From your
> other post it seems that holding i_mutex isn't a problem. Please
> advise.
OK, after my discussion with Ted today, i_mutex really shouldn't be an
issue so your patch should be fine. I just suggest you resend it to Ted to
remind yourself :).
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, Jun 20, 2011 at 7:32 AM, Jan Kara <[email protected]> wrote:
> ?Hi,
>
> On Thu 16-06-11 23:32:12, Manish Katiyar wrote:
>> On Wed, Jun 8, 2011 at 7:10 AM, Ted Ts'o <[email protected]> wrote:
>> > Ping?
>>
>> Is there anything required from my side on this patch ? From your
>> other post it seems that holding i_mutex isn't a problem. Please
>> advise.
> ?OK, after my discussion with Ted today, i_mutex really shouldn't be an
> issue so your patch should be fine. I just suggest you resend it to Ted to
> remind yourself :).
Thanks a lot Jan, Will resend the series to Ted.
--
Thanks -
Manish
On Mon, Jun 20, 2011 at 07:40:53AM -0700, Manish Katiyar wrote:
> > ?OK, after my discussion with Ted today, i_mutex really shouldn't be an
> > issue so your patch should be fine. I just suggest you resend it to Ted to
> > remind yourself :).
>
> Thanks a lot Jan, Will resend the series to Ted.
Thanks, but don't bother resending it unless you have any changes to
update.
I do keep track of patches using patchwork[1], so I haven't forgotten. :-)
[1] http://patchwork.ozlabs.org/project/linux-ext4/list/
I'll be starting to go through patches for the next merge window this
week.
- Ted
On Mon, Jun 20, 2011 at 10:57 AM, Ted Ts'o <[email protected]> wrote:
> On Mon, Jun 20, 2011 at 07:40:53AM -0700, Manish Katiyar wrote:
>> > ?OK, after my discussion with Ted today, i_mutex really shouldn't be an
>> > issue so your patch should be fine. I just suggest you resend it to Ted to
>> > remind yourself :).
>>
>> Thanks a lot Jan, Will resend the series to Ted.
>
> Thanks, but don't bother resending it unless you have any changes to
> update.
Thanks Ted, ... I don't have any other changes.
--
Thanks -
Manish