2015-02-06 04:45:10

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v3 1/5] NFSv4: Ensure we reference the inode for return-on-close in delegreturn

If we have to do a return-on-close in the delegreturn code, then
we must ensure that the inode and super block remain referenced.

Cc: Peng Tao <[email protected]>
Cc: [email protected] # 3.17.x
Signed-off-by: Trond Myklebust <[email protected]>
Reviewed-by: Peng Tao <[email protected]>
---
fs/nfs/internal.h | 22 +++++++++++++++++++++-
fs/nfs/nfs4proc.c | 14 +++++++++-----
fs/nfs/super.c | 9 ++++++---
3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a98cf2006179..21469e6e3834 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -391,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;

extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);

/* namespace.c */
@@ -514,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);

+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+ inode = igrab(inode);
+ if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+ if (inode != NULL) {
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ nfs_sb_deactive(sb);
+ }
+}
+
/*
* Determine the device name as a string
*/
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cd4295d84d54..dd892a4e7eb3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5175,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
static void nfs4_delegreturn_release(void *calldata)
{
struct nfs4_delegreturndata *data = calldata;
+ struct inode *inode = data->inode;

- if (data->roc)
- pnfs_roc_release(data->inode);
+ if (inode) {
+ if (data->roc)
+ pnfs_roc_release(inode);
+ nfs_iput_and_deactive(inode);
+ }
kfree(calldata);
}

@@ -5234,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
- data->inode = inode;
- data->roc = list_empty(&NFS_I(inode)->open_files) ?
- pnfs_roc(inode) : false;
+ data->inode = nfs_igrab_and_active(inode);
+ if (data->inode)
+ data->roc = nfs4_roc(inode);

task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..368d9395d2e7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -405,12 +405,15 @@ void __exit unregister_nfs_fs(void)
unregister_filesystem(&nfs_fs_type);
}

-void nfs_sb_active(struct super_block *sb)
+bool nfs_sb_active(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);

- if (atomic_inc_return(&server->active) == 1)
- atomic_inc(&sb->s_active);
+ if (!atomic_inc_not_zero(&sb->s_active))
+ return false;
+ if (atomic_inc_return(&server->active) != 1)
+ atomic_dec(&sb->s_active);
+ return true;
}
EXPORT_SYMBOL_GPL(nfs_sb_active);

--
2.1.0



2015-02-06 04:45:11

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v3 2/5] NFSv4.1: Pin the inode and super block in asynchronous layoutcommit

If we're sending an asynchronous layoutcommit, then we need to ensure
that the inode and the super block remain pinned.

Signed-off-by: Trond Myklebust <[email protected]>
Reviewed-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4proc.c | 19 +++++++++++--------
include/linux/nfs_xdr.h | 1 +
2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dd892a4e7eb3..e092b8540e2e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7989,6 +7989,7 @@ static void nfs4_layoutcommit_release(void *calldata)
nfs_post_op_update_inode_force_wcc(data->args.inode,
data->res.fattr);
put_rpccred(data->cred);
+ nfs_iput_and_deactive(data->inode);
kfree(data);
}

@@ -8013,7 +8014,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutcommit_ops,
.callback_data = data,
- .flags = RPC_TASK_ASYNC,
};
struct rpc_task *task;
int status = 0;
@@ -8024,18 +8024,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
data->args.lastbytewritten,
data->args.inode->i_ino);

+ if (!sync) {
+ data->inode = nfs_igrab_and_active(data->args.inode);
+ if (data->inode == NULL) {
+ nfs4_layoutcommit_release(data);
+ return -EAGAIN;
+ }
+ task_setup_data.flags = RPC_TASK_ASYNC;
+ }
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- if (sync == false)
- goto out;
- status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
+ if (sync)
+ status = task->tk_status;
trace_nfs4_layoutcommit(data->args.inode, status);
-out:
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2c35e2affa6f..bb0d56f737e0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -285,6 +285,7 @@ struct nfs4_layoutcommit_data {
struct nfs_fattr fattr;
struct list_head lseg_list;
struct rpc_cred *cred;
+ struct inode *inode;
struct nfs4_layoutcommit_args args;
struct nfs4_layoutcommit_res res;
};
--
2.1.0


2015-02-06 04:45:12

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v3 3/5] NFSv4.1: Pin the inode and super block in asynchronous layoutreturns

If we're sending an asynchronous layoutreturn, then we need to ensure
that the inode and the super block remain pinned.

Cc: Peng Tao <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
Reviewed-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4proc.c | 19 +++++++++++--------
include/linux/nfs_xdr.h | 1 +
2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e092b8540e2e..2e7c9f7a6f7c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7856,6 +7856,7 @@ static void nfs4_layoutreturn_release(void *calldata)
lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_put_layout_hdr(lrp->args.layout);
+ nfs_iput_and_deactive(lrp->inode);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
@@ -7880,23 +7881,25 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
- .flags = RPC_TASK_ASYNC,
};
int status = 0;

dprintk("--> %s\n", __func__);
+ if (!sync) {
+ lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+ if (!lrp->inode) {
+ nfs4_layoutreturn_release(lrp);
+ return -EAGAIN;
+ }
+ task_setup_data.flags |= RPC_TASK_ASYNC;
+ }
nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- if (sync == false)
- goto out;
- status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
+ if (sync)
+ status = task->tk_status;
trace_nfs4_layoutreturn(lrp->args.inode, status);
-out:
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index bb0d56f737e0..38d96ba935c2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -310,6 +310,7 @@ struct nfs4_layoutreturn {
struct nfs4_layoutreturn_res res;
struct rpc_cred *cred;
struct nfs_client *clp;
+ struct inode *inode;
int rpc_status;
};

--
2.1.0


2015-02-06 04:45:13

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v3 4/5] NFSv4.1: pnfs_send_layoutreturn should use GFP_NOFS

If we want to be able to call pnfs_send_layoutreturn() from within the
writeback path, we really want it to use GFP_NOFS in order to prevent
recursion.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 703501d3ed19..a1d8620e8cb7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -948,7 +948,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
struct nfs4_layoutreturn *lrp;
int status = 0;

- lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+ lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
--
2.1.0


2015-02-06 04:45:14

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH v3 5/5] NFSv4.1: Fix pnfs_put_lseg races

pnfs_layoutreturn_free_lseg_async() can also race with inode put in
the general case. We can now fix this, and also simplify the code.

Cc: Peng Tao <[email protected]>
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 53 +++++++++++++++++++----------------------------------
1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a1d8620e8cb7..107b321be7d4 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,14 +361,9 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
return true;
}

-static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
+static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_hdr *lo, struct inode *inode)
{
- struct pnfs_layout_segment *lseg;
- struct pnfs_layout_hdr *lo;
- struct inode *inode;
-
- lseg = container_of(work, struct pnfs_layout_segment, pls_work);
- WARN_ON(atomic_read(&lseg->pls_refcount));
lo = lseg->pls_layout;
inode = lo->plh_inode;

@@ -383,24 +378,12 @@ static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
lo->plh_block_lgets++;
lo->plh_return_iomode = 0;
spin_unlock(&inode->i_lock);
+ pnfs_get_layout_hdr(lo);

- pnfs_send_layoutreturn(lo, stateid, iomode, true);
- spin_lock(&inode->i_lock);
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, stateid, iomode, false);
} else
- /* match pnfs_get_layout_hdr #2 in pnfs_put_lseg */
- pnfs_put_layout_hdr(lo);
- pnfs_layout_remove_lseg(lo, lseg);
- spin_unlock(&inode->i_lock);
- pnfs_free_lseg(lseg);
- /* match pnfs_get_layout_hdr #1 in pnfs_put_lseg */
- pnfs_put_layout_hdr(lo);
-}
-
-static void
-pnfs_layoutreturn_free_lseg_async(struct pnfs_layout_segment *lseg)
-{
- INIT_WORK(&lseg->pls_work, pnfs_layoutreturn_free_lseg);
- queue_work(nfsiod_workqueue, &lseg->pls_work);
+ spin_unlock(&inode->i_lock);
}

void
@@ -415,21 +398,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+
+ /* Handle the case where refcount != 1 */
+ if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
+ return;
+
lo = lseg->pls_layout;
inode = lo->plh_inode;
+ /* Do we need a layoutreturn? */
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
+
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
pnfs_get_layout_hdr(lo);
- if (pnfs_layout_need_return(lo, lseg)) {
- spin_unlock(&inode->i_lock);
- /* hdr reference dropped in nfs4_layoutreturn_release */
- pnfs_get_layout_hdr(lo);
- pnfs_layoutreturn_free_lseg_async(lseg);
- } else {
- pnfs_layout_remove_lseg(lo, lseg);
- spin_unlock(&inode->i_lock);
- pnfs_free_lseg(lseg);
- pnfs_put_layout_hdr(lo);
- }
+ pnfs_layout_remove_lseg(lo, lseg);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg(lseg);
+ pnfs_put_layout_hdr(lo);
}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
--
2.1.0


2015-02-06 06:54:12

by Peng Tao

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] NFSv4.1: Fix pnfs_put_lseg races

On Fri, Feb 6, 2015 at 12:45 PM, Trond Myklebust
<[email protected]> wrote:
> pnfs_layoutreturn_free_lseg_async() can also race with inode put in
> the general case. We can now fix this, and also simplify the code.
>
> Cc: Peng Tao <[email protected]>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/pnfs.c | 53 +++++++++++++++++++----------------------------------
> 1 file changed, 19 insertions(+), 34 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index a1d8620e8cb7..107b321be7d4 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -361,14 +361,9 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
> return true;
> }
>
> -static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
> +static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
> + struct pnfs_layout_hdr *lo, struct inode *inode)
> {
> - struct pnfs_layout_segment *lseg;
> - struct pnfs_layout_hdr *lo;
> - struct inode *inode;
> -
> - lseg = container_of(work, struct pnfs_layout_segment, pls_work);
> - WARN_ON(atomic_read(&lseg->pls_refcount));
> lo = lseg->pls_layout;
> inode = lo->plh_inode;
>
> @@ -383,24 +378,12 @@ static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
> lo->plh_block_lgets++;
> lo->plh_return_iomode = 0;
> spin_unlock(&inode->i_lock);
> + pnfs_get_layout_hdr(lo);
>
> - pnfs_send_layoutreturn(lo, stateid, iomode, true);
> - spin_lock(&inode->i_lock);
> + /* Send an async layoutreturn so we dont deadlock */
> + pnfs_send_layoutreturn(lo, stateid, iomode, false);
> } else
> - /* match pnfs_get_layout_hdr #2 in pnfs_put_lseg */
> - pnfs_put_layout_hdr(lo);
> - pnfs_layout_remove_lseg(lo, lseg);
> - spin_unlock(&inode->i_lock);
> - pnfs_free_lseg(lseg);
> - /* match pnfs_get_layout_hdr #1 in pnfs_put_lseg */
> - pnfs_put_layout_hdr(lo);
> -}
> -
> -static void
> -pnfs_layoutreturn_free_lseg_async(struct pnfs_layout_segment *lseg)
> -{
> - INIT_WORK(&lseg->pls_work, pnfs_layoutreturn_free_lseg);
> - queue_work(nfsiod_workqueue, &lseg->pls_work);
> + spin_unlock(&inode->i_lock);
> }
>
> void
> @@ -415,21 +398,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
> dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
> atomic_read(&lseg->pls_refcount),
> test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
> +
> + /* Handle the case where refcount != 1 */
> + if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
> + return;
> +
> lo = lseg->pls_layout;
> inode = lo->plh_inode;
> + /* Do we need a layoutreturn? */
> + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
pnfs_layout_need_return() iterates through all layout segments to make
sure that if we have other segments that need to be returned, we do
not send layoutreturn for current lseg so that if they happen to
overlap, we don't return a layout while still holding one captive.

I guess the right thing to do is to move pnfs_layout_need_return() and
pnfs_layoutreturn_before_put_lseg() under atomic_dec_and_lock.

Cheers,
Tao

> + pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
> +
> if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
> pnfs_get_layout_hdr(lo);
> - if (pnfs_layout_need_return(lo, lseg)) {
> - spin_unlock(&inode->i_lock);
> - /* hdr reference dropped in nfs4_layoutreturn_release */
> - pnfs_get_layout_hdr(lo);
> - pnfs_layoutreturn_free_lseg_async(lseg);
> - } else {
> - pnfs_layout_remove_lseg(lo, lseg);
> - spin_unlock(&inode->i_lock);
> - pnfs_free_lseg(lseg);
> - pnfs_put_layout_hdr(lo);
> - }
> + pnfs_layout_remove_lseg(lo, lseg);
> + spin_unlock(&inode->i_lock);
> + pnfs_free_lseg(lseg);
> + pnfs_put_layout_hdr(lo);
> }
> }
> EXPORT_SYMBOL_GPL(pnfs_put_lseg);
> --
> 2.1.0
>

2015-02-06 07:02:49

by Peng Tao

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] NFSv4.1: Fix pnfs_put_lseg races

On Fri, Feb 6, 2015 at 2:53 PM, Peng Tao <[email protected]> wrote:
> On Fri, Feb 6, 2015 at 12:45 PM, Trond Myklebust
> <[email protected]> wrote:
>> pnfs_layoutreturn_free_lseg_async() can also race with inode put in
>> the general case. We can now fix this, and also simplify the code.
>>
>> Cc: Peng Tao <[email protected]>
>> Signed-off-by: Trond Myklebust <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 53 +++++++++++++++++++----------------------------------
>> 1 file changed, 19 insertions(+), 34 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index a1d8620e8cb7..107b321be7d4 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -361,14 +361,9 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
>> return true;
>> }
>>
>> -static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
>> +static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
>> + struct pnfs_layout_hdr *lo, struct inode *inode)
>> {
>> - struct pnfs_layout_segment *lseg;
>> - struct pnfs_layout_hdr *lo;
>> - struct inode *inode;
>> -
>> - lseg = container_of(work, struct pnfs_layout_segment, pls_work);
>> - WARN_ON(atomic_read(&lseg->pls_refcount));
>> lo = lseg->pls_layout;
>> inode = lo->plh_inode;
>>
>> @@ -383,24 +378,12 @@ static void pnfs_layoutreturn_free_lseg(struct work_struct *work)
>> lo->plh_block_lgets++;
>> lo->plh_return_iomode = 0;
>> spin_unlock(&inode->i_lock);
>> + pnfs_get_layout_hdr(lo);
>>
>> - pnfs_send_layoutreturn(lo, stateid, iomode, true);
>> - spin_lock(&inode->i_lock);
>> + /* Send an async layoutreturn so we dont deadlock */
>> + pnfs_send_layoutreturn(lo, stateid, iomode, false);
>> } else
>> - /* match pnfs_get_layout_hdr #2 in pnfs_put_lseg */
>> - pnfs_put_layout_hdr(lo);
>> - pnfs_layout_remove_lseg(lo, lseg);
>> - spin_unlock(&inode->i_lock);
>> - pnfs_free_lseg(lseg);
>> - /* match pnfs_get_layout_hdr #1 in pnfs_put_lseg */
>> - pnfs_put_layout_hdr(lo);
>> -}
>> -
>> -static void
>> -pnfs_layoutreturn_free_lseg_async(struct pnfs_layout_segment *lseg)
>> -{
>> - INIT_WORK(&lseg->pls_work, pnfs_layoutreturn_free_lseg);
>> - queue_work(nfsiod_workqueue, &lseg->pls_work);
>> + spin_unlock(&inode->i_lock);
>> }
>>
>> void
>> @@ -415,21 +398,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
>> dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
>> atomic_read(&lseg->pls_refcount),
>> test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
>> +
>> + /* Handle the case where refcount != 1 */
>> + if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
>> + return;
>> +
>> lo = lseg->pls_layout;
>> inode = lo->plh_inode;
>> + /* Do we need a layoutreturn? */
>> + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
> pnfs_layout_need_return() iterates through all layout segments to make
> sure that if we have other segments that need to be returned, we do
> not send layoutreturn for current lseg so that if they happen to
> overlap, we don't return a layout while still holding one captive.
>
This might not be a problem for files and flexfiles for now, but block
layout should be able to see it fairly easily if the server is returning
overlapping layout segments to the client.

Cheers,
Tao

> I guess the right thing to do is to move pnfs_layout_need_return() and
> pnfs_layoutreturn_before_put_lseg() under atomic_dec_and_lock.
>
> Cheers,
> Tao
>
>> + pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
>> +
>> if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
>> pnfs_get_layout_hdr(lo);
>> - if (pnfs_layout_need_return(lo, lseg)) {
>> - spin_unlock(&inode->i_lock);
>> - /* hdr reference dropped in nfs4_layoutreturn_release */
>> - pnfs_get_layout_hdr(lo);
>> - pnfs_layoutreturn_free_lseg_async(lseg);
>> - } else {
>> - pnfs_layout_remove_lseg(lo, lseg);
>> - spin_unlock(&inode->i_lock);
>> - pnfs_free_lseg(lseg);
>> - pnfs_put_layout_hdr(lo);
>> - }
>> + pnfs_layout_remove_lseg(lo, lseg);
>> + spin_unlock(&inode->i_lock);
>> + pnfs_free_lseg(lseg);
>> + pnfs_put_layout_hdr(lo);
>> }
>> }
>> EXPORT_SYMBOL_GPL(pnfs_put_lseg);
>> --
>> 2.1.0
>>

2015-02-06 12:26:40

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH v3 1/5] NFSv4: Ensure we reference the inode for return-on-close in delegreturn

On Thu, 5 Feb 2015 23:45:03 -0500
Trond Myklebust <[email protected]> wrote:

> If we have to do a return-on-close in the delegreturn code, then
> we must ensure that the inode and super block remain referenced.
>
> Cc: Peng Tao <[email protected]>
> Cc: [email protected] # 3.17.x
> Signed-off-by: Trond Myklebust <[email protected]>
> Reviewed-by: Peng Tao <[email protected]>
> ---
> fs/nfs/internal.h | 22 +++++++++++++++++++++-
> fs/nfs/nfs4proc.c | 14 +++++++++-----
> fs/nfs/super.c | 9 ++++++---
> 3 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index a98cf2006179..21469e6e3834 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -391,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
>
> extern int __init register_nfs_fs(void);
> extern void __exit unregister_nfs_fs(void);
> -extern void nfs_sb_active(struct super_block *sb);
> +extern bool nfs_sb_active(struct super_block *sb);
> extern void nfs_sb_deactive(struct super_block *sb);
>
> /* namespace.c */
> @@ -514,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
> struct nfs_client **result,
> struct rpc_cred *cred);
>
> +static inline struct inode *nfs_igrab_and_active(struct inode *inode)
> +{
> + inode = igrab(inode);

I would expect that you already hold a reference to the inode so
shouldn't that never return NULL? If so, then you could use ihold()
instead and simplify this a little.

> + if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
> + iput(inode);
> + inode = NULL;
> + }
> + return inode;
> +}
> +
> +static inline void nfs_iput_and_deactive(struct inode *inode)
> +{
> + if (inode != NULL) {
> + struct super_block *sb = inode->i_sb;
> +
> + iput(inode);
> + nfs_sb_deactive(sb);
> + }
> +}
> +
> /*
> * Determine the device name as a string
> */
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index cd4295d84d54..dd892a4e7eb3 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5175,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
> static void nfs4_delegreturn_release(void *calldata)
> {
> struct nfs4_delegreturndata *data = calldata;
> + struct inode *inode = data->inode;
>
> - if (data->roc)
> - pnfs_roc_release(data->inode);
> + if (inode) {
> + if (data->roc)
> + pnfs_roc_release(inode);
> + nfs_iput_and_deactive(inode);
> + }
> kfree(calldata);
> }
>
> @@ -5234,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
> nfs_fattr_init(data->res.fattr);
> data->timestamp = jiffies;
> data->rpc_status = 0;
> - data->inode = inode;
> - data->roc = list_empty(&NFS_I(inode)->open_files) ?
> - pnfs_roc(inode) : false;
> + data->inode = nfs_igrab_and_active(inode);
> + if (data->inode)
> + data->roc = nfs4_roc(inode);
>
> task_setup_data.callback_data = data;
> msg.rpc_argp = &data->args;
> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
> index 31a11b0e885d..368d9395d2e7 100644
> --- a/fs/nfs/super.c
> +++ b/fs/nfs/super.c
> @@ -405,12 +405,15 @@ void __exit unregister_nfs_fs(void)
> unregister_filesystem(&nfs_fs_type);
> }
>
> -void nfs_sb_active(struct super_block *sb)
> +bool nfs_sb_active(struct super_block *sb)
> {
> struct nfs_server *server = NFS_SB(sb);
>
> - if (atomic_inc_return(&server->active) == 1)
> - atomic_inc(&sb->s_active);
> + if (!atomic_inc_not_zero(&sb->s_active))
> + return false;
> + if (atomic_inc_return(&server->active) != 1)
> + atomic_dec(&sb->s_active);

Could you end up doing a 1->0 s_active transition here? Shouldn't this
be a deactivate_super instead?

> + return true;
> }
> EXPORT_SYMBOL_GPL(nfs_sb_active);
>


--
Jeff Layton <[email protected]>

2015-02-06 13:31:22

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 1/5] NFSv4: Ensure we reference the inode for return-on-close in delegreturn

On Fri, Feb 6, 2015 at 7:26 AM, Jeff Layton <[email protected]> wrote:
> On Thu, 5 Feb 2015 23:45:03 -0500
> Trond Myklebust <[email protected]> wrote:
>
>> If we have to do a return-on-close in the delegreturn code, then
>> we must ensure that the inode and super block remain referenced.
>>
>> Cc: Peng Tao <[email protected]>
>> Cc: [email protected] # 3.17.x
>> Signed-off-by: Trond Myklebust <[email protected]>
>> Reviewed-by: Peng Tao <[email protected]>
>> ---
>> fs/nfs/internal.h | 22 +++++++++++++++++++++-
>> fs/nfs/nfs4proc.c | 14 +++++++++-----
>> fs/nfs/super.c | 9 ++++++---
>> 3 files changed, 36 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index a98cf2006179..21469e6e3834 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -391,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
>>
>> extern int __init register_nfs_fs(void);
>> extern void __exit unregister_nfs_fs(void);
>> -extern void nfs_sb_active(struct super_block *sb);
>> +extern bool nfs_sb_active(struct super_block *sb);
>> extern void nfs_sb_deactive(struct super_block *sb);
>>
>> /* namespace.c */
>> @@ -514,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
>> struct nfs_client **result,
>> struct rpc_cred *cred);
>>
>> +static inline struct inode *nfs_igrab_and_active(struct inode *inode)
>> +{
>> + inode = igrab(inode);
>
> I would expect that you already hold a reference to the inode so
> shouldn't that never return NULL? If so, then you could use ihold()
> instead and simplify this a little.

The choice of igrab() is deliberate here because we may be called in
situations where the inode is in the process of being freed. Both
delegreturn and layoutreturn can be called as part of an 'evict_inode'
callback.

>
>> + if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
>> + iput(inode);
>> + inode = NULL;
>> + }
>> + return inode;
>> +}
>> +
>> +static inline void nfs_iput_and_deactive(struct inode *inode)
>> +{
>> + if (inode != NULL) {
>> + struct super_block *sb = inode->i_sb;
>> +
>> + iput(inode);
>> + nfs_sb_deactive(sb);
>> + }
>> +}
>> +
>> /*
>> * Determine the device name as a string
>> */
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index cd4295d84d54..dd892a4e7eb3 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5175,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
>> static void nfs4_delegreturn_release(void *calldata)
>> {
>> struct nfs4_delegreturndata *data = calldata;
>> + struct inode *inode = data->inode;
>>
>> - if (data->roc)
>> - pnfs_roc_release(data->inode);
>> + if (inode) {
>> + if (data->roc)
>> + pnfs_roc_release(inode);
>> + nfs_iput_and_deactive(inode);
>> + }
>> kfree(calldata);
>> }
>>
>> @@ -5234,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
>> nfs_fattr_init(data->res.fattr);
>> data->timestamp = jiffies;
>> data->rpc_status = 0;
>> - data->inode = inode;
>> - data->roc = list_empty(&NFS_I(inode)->open_files) ?
>> - pnfs_roc(inode) : false;
>> + data->inode = nfs_igrab_and_active(inode);
>> + if (data->inode)
>> + data->roc = nfs4_roc(inode);
>>
>> task_setup_data.callback_data = data;
>> msg.rpc_argp = &data->args;
>> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
>> index 31a11b0e885d..368d9395d2e7 100644
>> --- a/fs/nfs/super.c
>> +++ b/fs/nfs/super.c
>> @@ -405,12 +405,15 @@ void __exit unregister_nfs_fs(void)
>> unregister_filesystem(&nfs_fs_type);
>> }
>>
>> -void nfs_sb_active(struct super_block *sb)
>> +bool nfs_sb_active(struct super_block *sb)
>> {
>> struct nfs_server *server = NFS_SB(sb);
>>
>> - if (atomic_inc_return(&server->active) == 1)
>> - atomic_inc(&sb->s_active);
>> + if (!atomic_inc_not_zero(&sb->s_active))
>> + return false;
>> + if (atomic_inc_return(&server->active) != 1)
>> + atomic_dec(&sb->s_active);
>
> Could you end up doing a 1->0 s_active transition here? Shouldn't this
> be a deactivate_super instead?

No. The above line ensures that we take 1 reference to sb->s_active
(when server->active does a 0->1 transition) and only that reference.

--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]

2015-02-06 13:35:32

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v3 1/5] NFSv4: Ensure we reference the inode for return-on-close in delegreturn

On Fri, Feb 6, 2015 at 8:31 AM, Trond Myklebust
<[email protected]> wrote:
> On Fri, Feb 6, 2015 at 7:26 AM, Jeff Layton <[email protected]> wrote:
>> On Thu, 5 Feb 2015 23:45:03 -0500
>> Trond Myklebust <[email protected]> wrote:
>>
>>> If we have to do a return-on-close in the delegreturn code, then
>>> we must ensure that the inode and super block remain referenced.
>>>
>>> Cc: Peng Tao <[email protected]>
>>> Cc: [email protected] # 3.17.x
>>> Signed-off-by: Trond Myklebust <[email protected]>
>>> Reviewed-by: Peng Tao <[email protected]>
>>> ---
>>> fs/nfs/internal.h | 22 +++++++++++++++++++++-
>>> fs/nfs/nfs4proc.c | 14 +++++++++-----
>>> fs/nfs/super.c | 9 ++++++---
>>> 3 files changed, 36 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>>> index a98cf2006179..21469e6e3834 100644
>>> --- a/fs/nfs/internal.h
>>> +++ b/fs/nfs/internal.h
>>> @@ -391,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
>>>
>>> extern int __init register_nfs_fs(void);
>>> extern void __exit unregister_nfs_fs(void);
>>> -extern void nfs_sb_active(struct super_block *sb);
>>> +extern bool nfs_sb_active(struct super_block *sb);
>>> extern void nfs_sb_deactive(struct super_block *sb);
>>>
>>> /* namespace.c */
>>> @@ -514,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
>>> struct nfs_client **result,
>>> struct rpc_cred *cred);
>>>
>>> +static inline struct inode *nfs_igrab_and_active(struct inode *inode)
>>> +{
>>> + inode = igrab(inode);
>>
>> I would expect that you already hold a reference to the inode so
>> shouldn't that never return NULL? If so, then you could use ihold()
>> instead and simplify this a little.
>
> The choice of igrab() is deliberate here because we may be called in
> situations where the inode is in the process of being freed. Both
> delegreturn and layoutreturn can be called as part of an 'evict_inode'
> callback.

Just to clarify: the calls in evict_inode are safe, because we no
longer apply the asynchronous flag in the 'sync' case; however, it
would still be bad to call ihold()+iput() in that situation.

>>
>>> + if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
>>> + iput(inode);
>>> + inode = NULL;
>>> + }
>>> + return inode;
>>> +}
>>> +
>>> +static inline void nfs_iput_and_deactive(struct inode *inode)
>>> +{
>>> + if (inode != NULL) {
>>> + struct super_block *sb = inode->i_sb;
>>> +
>>> + iput(inode);
>>> + nfs_sb_deactive(sb);
>>> + }
>>> +}
>>> +
>>> /*
>>> * Determine the device name as a string
>>> */
>>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>>> index cd4295d84d54..dd892a4e7eb3 100644
>>> --- a/fs/nfs/nfs4proc.c
>>> +++ b/fs/nfs/nfs4proc.c
>>> @@ -5175,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
>>> static void nfs4_delegreturn_release(void *calldata)
>>> {
>>> struct nfs4_delegreturndata *data = calldata;
>>> + struct inode *inode = data->inode;
>>>
>>> - if (data->roc)
>>> - pnfs_roc_release(data->inode);
>>> + if (inode) {
>>> + if (data->roc)
>>> + pnfs_roc_release(inode);
>>> + nfs_iput_and_deactive(inode);
>>> + }
>>> kfree(calldata);
>>> }
>>>
>>> @@ -5234,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
>>> nfs_fattr_init(data->res.fattr);
>>> data->timestamp = jiffies;
>>> data->rpc_status = 0;
>>> - data->inode = inode;
>>> - data->roc = list_empty(&NFS_I(inode)->open_files) ?
>>> - pnfs_roc(inode) : false;
>>> + data->inode = nfs_igrab_and_active(inode);
>>> + if (data->inode)
>>> + data->roc = nfs4_roc(inode);
>>>
>>> task_setup_data.callback_data = data;
>>> msg.rpc_argp = &data->args;
>>> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
>>> index 31a11b0e885d..368d9395d2e7 100644
>>> --- a/fs/nfs/super.c
>>> +++ b/fs/nfs/super.c
>>> @@ -405,12 +405,15 @@ void __exit unregister_nfs_fs(void)
>>> unregister_filesystem(&nfs_fs_type);
>>> }
>>>
>>> -void nfs_sb_active(struct super_block *sb)
>>> +bool nfs_sb_active(struct super_block *sb)
>>> {
>>> struct nfs_server *server = NFS_SB(sb);
>>>
>>> - if (atomic_inc_return(&server->active) == 1)
>>> - atomic_inc(&sb->s_active);
>>> + if (!atomic_inc_not_zero(&sb->s_active))
>>> + return false;
>>> + if (atomic_inc_return(&server->active) != 1)
>>> + atomic_dec(&sb->s_active);
>>
>> Could you end up doing a 1->0 s_active transition here? Shouldn't this
>> be a deactivate_super instead?
>
> No. The above line ensures that we take 1 reference to sb->s_active
> (when server->active does a 0->1 transition) and only that reference.
>
> --
> Trond Myklebust
> Linux NFS client maintainer, PrimaryData
> [email protected]



--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]