This patch set fixes three bugs found in pnfsblock testing.
Peng Tao (3):
pnfsblock: fix NULL pointer dereference
pnfsblock: fix writeback deadlock
nfs4: serialize layoutcommit
fs/nfs/blocklayout/blocklayout.c | 11 +++++++++--
fs/nfs/nfs4proc.c | 6 ++++++
fs/nfs/pnfs.c | 24 +++++++++++++++++++++---
include/linux/nfs_fs.h | 1 +
4 files changed, 37 insertions(+), 5 deletions(-)
--
1.7.4.1
From: Peng Tao <[email protected]>
We should check if the sector is already initialized before
trying to grab the page from page cache. Otherwise when two
pages of the same block are written back by two threads each
calling from writepage_locked, it can cause deadlock like bellow.
[ 1080.972099] INFO: task kswapd0:25 blocked for more than 120 seconds.
[ 1080.972377] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1080.972812] kswapd0 D ffff88000c4926c0 0 25 2 0x00000000
[ 1080.972816] ffff88000df276b0 0000000000000046 ffff88000df27640 ffffffff81013ba7
[ 1080.972821] ffff88000c492310 ffff88000df27fd8 ffff88000df27fd8 00000000001d3440
[ 1080.972824] ffff88000c378000 ffff88000c492310 ffff8800175d3d40 ffff880017fc75a8
[ 1080.972828] Call Trace:
[ 1080.972860] [<ffffffff81013ba7>] ? read_tsc+0x9/0x19
[ 1080.972877] [<ffffffff810e0b23>] ? lock_page+0x2b/0x2b
[ 1080.972899] [<ffffffff81475a1d>] io_schedule+0x63/0x7e
[ 1080.972902] [<ffffffff810e0b31>] sleep_on_page+0xe/0x12
[ 1080.972905] [<ffffffff81475fe8>] __wait_on_bit_lock+0x46/0x8f
[ 1080.972916] [<ffffffff810822d7>] ? lock_release_holdtime.part.7+0x6b/0x72
[ 1080.972919] [<ffffffff810e0af6>] __lock_page+0x66/0x68
[ 1080.972928] [<ffffffff81072705>] ? autoremove_wake_function+0x3d/0x3d
[ 1080.972932] [<ffffffff810e0b1f>] lock_page+0x27/0x2b
[ 1080.972934] [<ffffffff810e0bcf>] find_lock_page+0x34/0x57
[ 1080.972937] [<ffffffff810e1738>] find_or_create_page+0x34/0x8a
[ 1080.972947] [<ffffffffa034245b>] bl_write_pagelist+0x205/0x6da [blocklayoutdriver]
[ 1080.972951] [<ffffffffa034145d>] ? bl_free_lseg+0x38/0x38 [blocklayoutdriver]
[ 1080.972995] [<ffffffffa02e27b9>] ? nfs_write_rpcsetup+0x118/0x123 [nfs]
[ 1080.973033] [<ffffffffa030246b>] pnfs_generic_pg_writepages+0x10b/0x1f4 [nfs]
[ 1080.973089] [<ffffffffa02deaae>] nfs_pageio_doio+0x1a/0x43 [nfs]
[ 1080.973098] [<ffffffffa02df035>] nfs_pageio_complete+0x16/0x2d [nfs]
[ 1080.973108] [<ffffffffa02e2d8f>] nfs_writepage_locked+0xa0/0xbf [nfs]
[ 1080.973119] [<ffffffffa02e36a1>] nfs_writepage+0x16/0x2b [nfs]
[ 1080.973122] [<ffffffff810e8762>] ? clear_page_dirty_for_io+0x87/0x9a
[ 1080.973133] [<ffffffff810efc5b>] shrink_page_list+0x39b/0x6c8
[ 1080.973139] [<ffffffff810f03bb>] shrink_inactive_list+0x22c/0x39e
[ 1080.973144] [<ffffffff810822d7>] ? lock_release_holdtime.part.7+0x6b/0x72
[ 1080.973148] [<ffffffff810f0c33>] shrink_zone+0x445/0x588
[ 1080.973152] [<ffffffff810f1a11>] balance_pgdat+0x2c2/0x56b
[ 1080.973170] [<ffffffff81254208>] ? __bitmap_weight+0x34/0x80
[ 1080.973175] [<ffffffff810f1f78>] kswapd+0x2be/0x2fa
[ 1080.973179] [<ffffffff810726c8>] ? __init_waitqueue_head+0x4b/0x4b
[ 1080.973183] [<ffffffff810f1cba>] ? balance_pgdat+0x56b/0x56b
[ 1080.973187] [<ffffffff81071f69>] kthread+0xa8/0xb0
[ 1080.973200] [<ffffffff814806b4>] kernel_thread_helper+0x4/0x10
[ 1080.973205] [<ffffffff81071ec1>] ? __init_kthread_worker+0x5a/0x5a
[ 1080.973210] [<ffffffff814806b0>] ? gs_change+0x13/0x13
[ 1080.973213] no locks held by kswapd0/25.
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Jim Rees <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 8d06d5c..8f97578 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -533,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
fill_invalid_ext:
dprintk("%s need to zero %d pages\n", __func__, npg_zero);
for (;npg_zero > 0; npg_zero--) {
+ if (bl_is_sector_init(be->be_inval, isect)) {
+ dprintk("isect %llu already init\n",
+ (unsigned long long)isect);
+ goto next_page;
+ }
/* page ref released in bl_end_io_write_zero */
index = isect >> PAGE_CACHE_SECTOR_SHIFT;
dprintk("%s zero %dth page: index %lu isect %llu\n",
@@ -552,8 +557,7 @@ fill_invalid_ext:
* PageUptodate: It was read before
* sector_initialized: already written out
*/
- if (PageDirty(page) || PageWriteback(page) ||
- bl_is_sector_init(be->be_inval, isect)) {
+ if (PageDirty(page) || PageWriteback(page)) {
print_page(page);
unlock_page(page);
page_cache_release(page);
--
1.7.4.1
From: Peng Tao <[email protected]>
Current pnfs_layoutcommit_inode can not handle parallel layoutcommit.
And as Trond suggested , there is no need for client to optimize for
parallel layoutcommit. So add NFS_INO_LAYOUTCOMMITTING flag to mark
inflight layoutcommit and serialize lalyoutcommit with it. Also call
mark_inode_dirty_sync if pnfs_layoutcommit_inode fails to issue
layoutcommit, so that layoutcommit can be retried later.
It also fixes the pls_lc_list corruption that Vitaliy found.
Reported-by: Vitaliy Gusev <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Jim Rees <[email protected]>
---
fs/nfs/nfs4proc.c | 6 ++++++
fs/nfs/pnfs.c | 24 +++++++++++++++++++++---
include/linux/nfs_fs.h | 1 +
3 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039..d9ffe58 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5962,6 +5962,7 @@ static void nfs4_layoutcommit_release(void *calldata)
{
struct nfs4_layoutcommit_data *data = calldata;
struct pnfs_layout_segment *lseg, *tmp;
+ unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
pnfs_cleanup_layoutcommit(data);
/* Matched by references in pnfs_set_layoutcommit */
@@ -5971,6 +5972,11 @@ static void nfs4_layoutcommit_release(void *calldata)
&lseg->pls_flags))
put_lseg(lseg);
}
+
+ clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+
put_rpccred(data->cred);
kfree(data);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e5e11b4..fc15aa6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1446,17 +1446,30 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
data = kzalloc(sizeof(*data), GFP_NOFS);
if (!data) {
- mark_inode_dirty_sync(inode);
status = -ENOMEM;
goto out;
}
+ if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_free;
+ else if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
+ if (!sync) {
+ status = -EAGAIN;
+ goto out_free;
+ }
+ status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (status)
+ goto out_free;
+ }
+
INIT_LIST_HEAD(&data->lseg_list);
spin_lock(&inode->i_lock);
if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+ clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
spin_unlock(&inode->i_lock);
- kfree(data);
- goto out;
+ wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
+ goto out_free;
}
pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1478,8 +1491,13 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
status = nfs4_proc_layoutcommit(data, sync);
out:
+ if (status)
+ mark_inode_dirty_sync(inode);
dprintk("<-- %s status %d\n", __func__, status);
return status;
+out_free:
+ kfree(data);
+ goto out;
}
/*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index eaac770..c5b2b30 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -230,6 +230,7 @@ struct nfs_inode {
#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */
#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
+#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{
--
1.7.4.1
From: Peng Tao <[email protected]>
bl_add_page_to_bio returns error pointer. bio should be reset to
NULL in failure cases as the out path always calls bl_submit_bio.
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Jim Rees <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1038b8c..8d06d5c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -292,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata)
bl_end_io_read, par);
if (IS_ERR(bio)) {
rdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
}
@@ -581,6 +582,7 @@ fill_invalid_ext:
bl_end_io_write_zero, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
/* FIXME: This should be done in bi_end_io */
@@ -629,6 +631,7 @@ next_page:
bl_end_io_write, par);
if (IS_ERR(bio)) {
wdata->pnfs_error = PTR_ERR(bio);
+ bio = NULL;
goto out;
}
isect += PAGE_CACHE_SECTORS;
--
1.7.4.1
On 2011-09-20 06:23, Jim Rees wrote:
> This patch set fixes three bugs found in pnfsblock testing.
>
> Peng Tao (3):
> pnfsblock: fix NULL pointer dereference
> pnfsblock: fix writeback deadlock
> nfs4: serialize layoutcommit
>
> fs/nfs/blocklayout/blocklayout.c | 11 +++++++++--
> fs/nfs/nfs4proc.c | 6 ++++++
> fs/nfs/pnfs.c | 24 +++++++++++++++++++++---
> include/linux/nfs_fs.h | 1 +
> 4 files changed, 37 insertions(+), 5 deletions(-)
>
Submitted these too,
Thanks!
Benny