Version 4 of the wave 2 submission.
changes from version 3:
- send DELAY on CB_LAYOUTRECALL only if lsegs remain after try to cleanup, instead of if they exist when start cleanup
These patches implement wave 2 of the pnfs submission, which
encompasses CB_LAYOUTRECALL and its serialization with LAYOUTGET,
as well as the "forgetful model" in which LAYOUTRETURNs are never
sent, but instead merely discarded by the client.
No functional changes, just some code minor code rearrangement and
comments.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 19 ++++++++++---------
1 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f475516..1609a07 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5297,10 +5297,14 @@ static void
nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- struct inode *ino = lgp->args.inode;
- struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
dprintk("--> %s\n", __func__);
+ /* Note the is a race here, where a CB_LAYOUTRECALL can come in
+ * right now covering the LAYOUTGET we are about to send.
+ * However, that is not so catastrophic, and there seems
+ * to be no way to prevent it completely.
+ */
if (nfs4_setup_sequence(server, &lgp->args.seq_args,
&lgp->res.seq_res, 0, task))
return;
@@ -5383,13 +5387,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
if (IS_ERR(task))
return PTR_ERR(task);
status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
- if (status != 0)
- goto out;
- status = pnfs_layout_process(lgp);
-out:
+ if (status == 0)
+ status = task->tk_status;
+ if (status == 0)
+ status = pnfs_layout_process(lgp);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
return status;
--
1.7.2.1
This is the heart of the wave 2 submission. Add the code to trigger
drain and forget of any afected layouts. In addition, we set a
"barrier", below which any LAYOUTGET reply is ignored. This is to
compensate for the fact that we do not wait for outstanding LAYOUTGETs
to complete as per section 12.5.5.2.1 of RFC 5661.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback_proc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/nfs4_fs.h | 1 +
fs/nfs/pnfs.c | 84 ++++++++++++++++++++++++++--------
fs/nfs/pnfs.h | 12 +++++
4 files changed, 196 insertions(+), 20 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1bb157..a8e8e20 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,6 +12,7 @@
#include "callback.h"
#include "delegation.h"
#include "internal.h"
+#include "pnfs.h"
#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -107,10 +108,126 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
+static int initiate_layout_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct pnfs_layout_hdr *lo;
+ int rv = NFS4ERR_NOMATCHING_LAYOUT;
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ LIST_HEAD(free_me_list);
+
+ args->cbl_inode = NULL;
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+ if (nfs_compare_fh(&args->cbl_fh,
+ &NFS_I(lo->plh_inode)->fh))
+ continue;
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ rv = NFS4ERR_DELAY;
+ else {
+ /* Without this, layout can be freed as soon
+ * as we release cl_lock. Matched in
+ * do_callback_layoutrecall.
+ */
+ get_layout_hdr(lo);
+ args->cbl_inode = lo->plh_inode;
+ rv = NFS4_OK;
+ }
+ break;
+ }
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&lo->plh_inode->i_lock);
+ if (rv == NFS4_OK) {
+ lo->plh_block_lgets++;
+ mark_matching_lsegs_invalid(lo, &free_me_list,
+ args->cbl_range.iomode);
+ if (list_empty(&lo->plh_segs))
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+ }
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&lo->plh_inode->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ } else {
+ struct pnfs_layout_hdr *tmp;
+ LIST_HEAD(recall_list);
+ LIST_HEAD(free_me_list);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+ if ((args->cbl_recall_type == RETURN_FSID) &&
+ memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+ &args->cbl_fsid, sizeof(struct nfs_fsid)))
+ continue;
+ get_layout_hdr(lo);
+ BUG_ON(!list_empty(&lo->plh_bulk_recall));
+ list_add(&lo->plh_bulk_recall, &recall_list);
+ }
+ spin_unlock(&clp->cl_lock);
+ list_for_each_entry_safe(lo, tmp,
+ &recall_list, plh_bulk_recall) {
+ spin_lock(&lo->plh_inode->i_lock);
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode);
+ list_del_init(&lo->plh_bulk_recall);
+ if (!list_empty(&lo->plh_segs))
+ rv = NFS4_OK;
+ spin_unlock(&lo->plh_inode->i_lock);
+ put_layout_hdr(lo);
+ }
+ pnfs_free_lseg_list(&free_me_list);
+ }
+ return rv;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ u32 status, res = NFS4ERR_DELAY;
+
+ dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+ if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
+ goto out;
+ status = initiate_layout_draining(clp, args);
+ if (status)
+ res = status;
+ else if (args->cbl_recall_type == RETURN_FILE) {
+ struct pnfs_layout_hdr *lo;
+
+ lo = NFS_I(args->cbl_inode)->layout;
+ spin_lock(&lo->plh_inode->i_lock);
+ lo->plh_block_lgets--;
+ spin_unlock(&lo->plh_inode->i_lock);
+ put_layout_hdr(lo);
+ res = NFS4ERR_DELAY;
+ }
+ clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+out:
+ dprintk("%s returning %i\n", __func__, res);
+ return res;
+
+}
+
__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps)
{
- return cpu_to_be32(NFS4ERR_NOTSUPP); /* STUB */
+ u32 res;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (cps->clp)
+ res = do_callback_layoutrecall(cps->clp, args);
+ else
+ res = NFS4ERR_OP_NOT_IN_SESSION;
+
+ dprintk("%s: exit with status = %d\n", __func__, res);
+ return cpu_to_be32(res);
}
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a6eecf..d927251 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
NFS4CLNT_RECLAIM_REBOOT,
NFS4CLNT_RECLAIM_NOGRACE,
NFS4CLNT_DELEGRETURN,
+ NFS4CLNT_LAYOUTRECALL,
NFS4CLNT_SESSION_RESET,
NFS4CLNT_RECALL_SLOT,
};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5a2ca98..83f92fe 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -178,7 +178,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
*/
/* Need to hold i_lock if caller does not already hold reference */
-static void
+void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
atomic_inc(&lo->plh_refcount);
@@ -253,6 +253,7 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
/* List does not take a reference, so no need for put here */
list_del_init(&lseg->pls_layout->plh_layouts);
spin_unlock(&clp->cl_lock);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
}
list_add(&lseg->pls_list, tmp_list);
}
@@ -278,7 +279,7 @@ static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
}
/* Returns false if no lsegs match, true otherwise */
-static bool
+bool
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
u32 iomode)
@@ -301,7 +302,7 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return rv;
}
-static void
+void
pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
@@ -354,23 +355,46 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
}
/* update lo->plh_stateid with new if is more recent */
-static void
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new)
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ bool update_barrier)
{
u32 oldseq, newseq;
oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
newseq = be32_to_cpu(new->stateid.seqid);
- if ((int)(newseq - oldseq) > 0)
+ if ((int)(newseq - oldseq) > 0) {
memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
+ if (update_barrier) {
+ u32 new_barrier = be32_to_cpu(new->stateid.seqid);
+
+ if ((int)(new_barrier - lo->plh_barrier))
+ lo->plh_barrier = new_barrier;
+ } else {
+ /* Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. It needs to be
+ * within 2**31 to count as "behind", so if it
+ * gets too near that limit, give us a litle leeway
+ * and bring it to within 2**30.
+ * NOTE - and yes, this is all unsigned arithmetic.
+ */
+ if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+ lo->plh_barrier = newseq - (1 << 30);
+ }
+ }
}
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, int lget)
-{
- return (list_empty(&lo->plh_segs) &&
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+ int lget)
+{
+ if ((stateid) &&
+ (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+ return true;
+ return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ (list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
}
@@ -382,7 +406,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, 1)) {
+ if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
status = -EAGAIN;
} else if (list_empty(&lo->plh_segs)) {
int seq;
@@ -501,6 +525,7 @@ alloc_init_layout_hdr(struct inode *ino)
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_bulk_recall);
lo->plh_inode = ino;
return lo;
}
@@ -552,7 +577,7 @@ is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -597,19 +622,22 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
}
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_has_layout(lo, iomode);
- if (lseg) {
- dprintk("%s: Using cached lseg %p for iomode %d)\n",
- __func__, lseg, iomode);
+ /* Do we even need to bother with this? */
+ if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
+ /* Check to see if the layout for the given range already exists */
+ lseg = pnfs_find_lseg(lo, iomode);
+ if (lseg)
+ goto out_unlock;
/* if LAYOUTGET already failed once we don't try again */
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
- if (pnfs_layoutgets_blocked(lo, 0))
+ if (pnfs_layoutgets_blocked(lo, NULL, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
@@ -632,6 +660,7 @@ pnfs_update_layout(struct inode *ino,
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
spin_unlock(&ino->i_lock);
}
@@ -653,6 +682,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
/* Verify we got what we asked for.
@@ -679,16 +709,32 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
spin_lock(&ino->i_lock);
+ if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ dprintk("%s forget reply due to recall\n", __func__);
+ goto out_forget_reply;
+ }
+
+ if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+ dprintk("%s forget reply due to state\n", __func__);
+ goto out_forget_reply;
+ }
init_lseg(lo, lseg);
lseg->pls_range = res->range;
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
/* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid);
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
spin_unlock(&ino->i_lock);
out:
return status;
+
+out_forget_reply:
+ spin_unlock(&ino->i_lock);
+ lseg->pls_layout = lo;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ goto out;
}
/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8aaab56..9c81d82 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -49,6 +49,7 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
@@ -67,9 +68,12 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
atomic_t plh_outstanding; /* number of RPCs out */
+ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
+ u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
struct inode *plh_inode;
};
@@ -139,18 +143,26 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
/* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);
+bool mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ u32 iomode);
static inline int lo_fail_bit(u32 iomode)
--
1.7.2.1
On 2010-12-23 02:19, Fred Isaman wrote:
>
> On Dec 22, 2010, at 5:00 PM, Trond Myklebust wrote:
>
>> On Tue, 2010-12-21 at 23:00 -0500, Fred Isaman wrote:
>>> A lsyout can request return-on-close. How this interacts with the
>>> forgetful model of never sending LAYOUTRETURNS is a bit ambiguous.
>>> We forget any layouts marked roc, and wait for them to be completely
>>> forgotten before continuing with the close. In addition, to compensate
>>> for races with any inflight LAYOUTGETs, and the fact that we do not get
>>> any layout stateid back from the server, we set the barrier to the worst
>>> case scenario of current_seqid + number of outstanding LAYOUTGETS.
>>>
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> ---
>>> fs/nfs/inode.c | 1 +
>>> fs/nfs/nfs4_fs.h | 2 +-
>>> fs/nfs/nfs4proc.c | 21 +++++++++++-
>>> fs/nfs/nfs4state.c | 7 +++-
>>> fs/nfs/pnfs.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
>>> fs/nfs/pnfs.h | 28 ++++++++++++++++
>>> include/linux/nfs_fs.h | 1 +
>>> 7 files changed, 138 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>>> index 43a69da..c64bb40 100644
>>> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
>>> index 29d504d..90515de 100644
>>> --- a/include/linux/nfs_fs.h
>>> +++ b/include/linux/nfs_fs.h
>>> @@ -190,6 +190,7 @@ struct nfs_inode {
>>> struct rw_semaphore rwsem;
>>>
>>> /* pNFS layout information */
>>> + struct rpc_wait_queue lo_rpcwaitq;
>>> struct pnfs_layout_hdr *layout;
>>> #endif /* CONFIG_NFS_V4*/
>>> #ifdef CONFIG_NFS_FSCACHE
>>
>> I believe that I've asked this before. Why do we need a per-inode
>> rpc_wait_queue just to support pnfs? That's a significant expansion of
>> an already bloated structure.
>>
>> Can we please either make this a single per-filesystem wait queue, or
>> else possibly a pool of wait queues?
>>
>> Trond
>
> This was introduced to avoid deadlocks that were occurring when we had a single wait queue. However, the deadlocks I remember were due to a combination of the fact that, at the time, we handled EAGAIN errors of IO outside the RPC code, and we sent LAYOUTRETURN on such error. Since we do neither now, I believe a single per-filesystem wait queue will suffice. Anyone disagree?
The dead locks were also because we didn't use rpc wait queue but rather a thread based one.
Doing the serialization in the rpc prepare phase using a shared queue should cause dead locks.
Benny
>
> Fred
>
>>
>> --
>> Trond Myklebust
>> Linux NFS client maintainer
>>
>> NetApp
>> [email protected]
>> http://www.netapp.com
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
We shouldn't send a LAYOUTGET(openstateid) unless all outstanding RPCs
using the previous stateid are completed. This requires choosing the
stateid to encode earlier, so we can abort if one is not available (we
want to use the open stateid, but a LAYOUTGET is already out using
it), and adding a count of the number of outstanding rpc calls using
layout state (which for now consist solely of LAYOUTGETs).
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 7 ++++++-
fs/nfs/nfs4xdr.c | 5 +----
fs/nfs/pnfs.c | 24 +++++++++++++++++++-----
fs/nfs/pnfs.h | 1 +
include/linux/nfs_xdr.h | 1 +
5 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1609a07..3b059d3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5308,6 +5308,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs4_setup_sequence(server, &lgp->args.seq_args,
&lgp->res.seq_res, 0, task))
return;
+ if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ NFS_I(lgp->args.inode)->layout,
+ lgp->args.ctx->state)) {
+ rpc_exit(task, NFS4_OK);
+ return;
+ }
rpc_call_start(task);
}
@@ -5342,7 +5348,6 @@ static void nfs4_layoutget_release(void *calldata)
struct nfs4_layoutget *lgp = calldata;
dprintk("--> %s\n", __func__);
- put_layout_hdr(lgp->args.inode);
if (lgp->res.layout.buf != NULL)
free_page((unsigned long) lgp->res.layout.buf);
put_nfs_open_context(lgp->args.ctx);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e28242..3cbdd0c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1787,7 +1787,6 @@ encode_layoutget(struct xdr_stream *xdr,
const struct nfs4_layoutget_args *args,
struct compound_hdr *hdr)
{
- nfs4_stateid stateid;
__be32 *p;
p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1798,9 +1797,7 @@ encode_layoutget(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- pnfs_choose_layoutget_stateid(&stateid, NFS_I(args->inode)->layout,
- args->ctx->state);
- p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+ p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
*p = cpu_to_be32(args->maxcount);
dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bbca2e5..60fbd41 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -362,6 +362,14 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
}
+/* lget is set to 1 if called from inside send_layoutget call chain */
+static bool
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, int lget)
+{
+ return (list_empty(&lo->plh_segs) &&
+ (atomic_read(&lo->plh_outstanding) > lget));
+}
+
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state)
@@ -370,7 +378,9 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (list_empty(&lo->plh_segs)) {
+ if (pnfs_layoutgets_blocked(lo, 1)) {
+ status = -EAGAIN;
+ } else if (list_empty(&lo->plh_segs)) {
int seq;
do {
@@ -405,10 +415,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
BUG_ON(ctx == NULL);
lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
- if (lgp == NULL) {
- put_layout_hdr(lo->plh_inode);
+ if (lgp == NULL)
return NULL;
- }
lgp->args.minlength = NFS4_MAX_UINT64;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
lgp->args.range.iomode = iomode;
@@ -604,10 +612,16 @@ pnfs_update_layout(struct inode *ino,
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
- get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+ if (pnfs_layoutgets_blocked(lo, 0))
+ goto out_unlock;
+ atomic_inc(&lo->plh_outstanding);
+
+ get_layout_hdr_locked(lo);
spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
+ atomic_dec(&lo->plh_outstanding);
+ put_layout_hdr(ino);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout->plh_flags, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 787253e..698380d 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -69,6 +69,7 @@ struct pnfs_layout_hdr {
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
+ atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_flags;
struct inode *plh_inode;
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f923831..0843703 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -208,6 +208,7 @@ struct nfs4_layoutget_args {
struct inode *inode;
struct nfs_open_context *ctx;
struct nfs4_sequence_args seq_args;
+ nfs4_stateid stateid;
};
struct nfs4_layoutget_res {
--
1.7.2.1
Comment references get_layout_hdr_locked, which never existed in
submitted code.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db77342..6e9daff 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -225,7 +225,7 @@ destroy_lseg(struct kref *kref)
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+ /* Matched by get_layout_hdr in pnfs_insert_layout */
put_layout_hdr(ino);
}
--
1.7.2.1
On Thu, 2010-12-23 at 12:29 -0500, Fred Isaman wrote:
> This is the heart of the wave 2 submission. Add the code to trigger
> drain and forget of any afected layouts. In addition, we set a
> "barrier", below which any LAYOUTGET reply is ignored. This is to
> compensate for the fact that we do not wait for outstanding LAYOUTGETs
> to complete as per section 12.5.5.2.1 of RFC 5661.
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/callback_proc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
> fs/nfs/nfs4_fs.h | 1 +
> fs/nfs/pnfs.c | 84 ++++++++++++++++++++++++++--------
> fs/nfs/pnfs.h | 12 +++++
> 4 files changed, 196 insertions(+), 20 deletions(-)
>
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index c1bb157..a8e8e20 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -12,6 +12,7 @@
> #include "callback.h"
> #include "delegation.h"
> #include "internal.h"
> +#include "pnfs.h"
>
> #ifdef NFS_DEBUG
> #define NFSDBG_FACILITY NFSDBG_CALLBACK
> @@ -107,10 +108,126 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>
> #if defined(CONFIG_NFS_V4_1)
>
> +static int initiate_layout_draining(struct nfs_client *clp,
^^^ u32
> + struct cb_layoutrecallargs *args)
> +{
> + struct pnfs_layout_hdr *lo;
> + int rv = NFS4ERR_NOMATCHING_LAYOUT;
^^^ u32
> +
> + if (args->cbl_recall_type == RETURN_FILE) {
> + LIST_HEAD(free_me_list);
> +
> + args->cbl_inode = NULL;
> + spin_lock(&clp->cl_lock);
> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
> + if (nfs_compare_fh(&args->cbl_fh,
> + &NFS_I(lo->plh_inode)->fh))
> + continue;
> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
> + rv = NFS4ERR_DELAY;
> + else {
> + /* Without this, layout can be freed as soon
> + * as we release cl_lock. Matched in
> + * do_callback_layoutrecall.
> + */
> + get_layout_hdr(lo);
> + args->cbl_inode = lo->plh_inode;
> + rv = NFS4_OK;
> + }
> + break;
> + }
> + spin_unlock(&clp->cl_lock);
> +
> + spin_lock(&lo->plh_inode->i_lock);
What guarantees do you have that the inode still exists here? Does the
layout segment hold a reference to it?
Also, How do you know that (!test_bit(NFS_LAYOUT_BULK_RECALL,
&lo->plh_flags)) is still valid when you get here?
> + if (rv == NFS4_OK) {
> + lo->plh_block_lgets++;
> + mark_matching_lsegs_invalid(lo, &free_me_list,
> + args->cbl_range.iomode);
> + if (list_empty(&lo->plh_segs))
> + rv = NFS4ERR_NOMATCHING_LAYOUT;
> + }
> + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
> + spin_unlock(&lo->plh_inode->i_lock);
> + pnfs_free_lseg_list(&free_me_list);
> + } else {
> + struct pnfs_layout_hdr *tmp;
> + LIST_HEAD(recall_list);
> + LIST_HEAD(free_me_list);
> + struct pnfs_layout_range range = {
> + .iomode = IOMODE_ANY,
> + .offset = 0,
> + .length = NFS4_MAX_UINT64,
> + };
> +
> + spin_lock(&clp->cl_lock);
> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
> + if ((args->cbl_recall_type == RETURN_FSID) &&
> + memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
> + &args->cbl_fsid, sizeof(struct nfs_fsid)))
> + continue;
> + get_layout_hdr(lo);
> + BUG_ON(!list_empty(&lo->plh_bulk_recall));
> + list_add(&lo->plh_bulk_recall, &recall_list);
> + }
> + spin_unlock(&clp->cl_lock);
> + list_for_each_entry_safe(lo, tmp,
> + &recall_list, plh_bulk_recall) {
> + spin_lock(&lo->plh_inode->i_lock);
Ditto. What guarantees that the inode still exists here, and how do you
avoid races with a file return request?
> + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
> + mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode);
> + list_del_init(&lo->plh_bulk_recall);
> + if (!list_empty(&lo->plh_segs))
> + rv = NFS4_OK;
> + spin_unlock(&lo->plh_inode->i_lock);
> + put_layout_hdr(lo);
> + }
> + pnfs_free_lseg_list(&free_me_list);
> + }
> + return rv;
> +}
Shouldn't the above really be 2 different functions? One for file
return, and the other for bulk layout returns?
> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
> + struct cb_layoutrecallargs *args)
> +{
> + u32 status, res = NFS4ERR_DELAY;
> +
> + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
> + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
> + goto out;
> + status = initiate_layout_draining(clp, args);
> + if (status)
^^^^^^^^^^^
If you are going to insist on spelling out 'rv = NFS4_OK' instead of 'rv
= 0' in initiate_layout_draining(), then shouldn't the above test be 'if
(status != NFS4_OK)'?
Please just dump all these NFS4_OK references, and replace them with 'rv
= 0' so that tests like the above make sense. That's consistent with the
rest of the NFS code.
> + res = status;
> + else if (args->cbl_recall_type == RETURN_FILE) {
> + struct pnfs_layout_hdr *lo;
> +
> + lo = NFS_I(args->cbl_inode)->layout;
> + spin_lock(&lo->plh_inode->i_lock);
> + lo->plh_block_lgets--;
> + spin_unlock(&lo->plh_inode->i_lock);
> + put_layout_hdr(lo);
> + res = NFS4ERR_DELAY;
Eh? Why doesn't this belong in (the split up version of)
initiate_layout_draining()?
> + }
> + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
> +out:
> + dprintk("%s returning %i\n", __func__, res);
> + return res;
> +
> +}
> +
--
Trond Myklebust
Linux NFS client maintainer
NetApp
[email protected]
http://www.netapp.com
A layout can request return-on-close. How this interacts with the
forgetful model of never sending LAYOUTRETURNS is a bit ambiguous.
We forget any layouts marked roc, and wait for them to be completely
forgotten before continuing with the close. In addition, to compensate
for races with any inflight LAYOUTGETs, and the fact that we do not get
any layout stateid back from the server, we set the barrier to the worst
case scenario of current_seqid + number of outstanding LAYOUTGETS.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/client.c | 13 +++++++
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 21 ++++++++++-
fs/nfs/nfs4state.c | 7 +++-
fs/nfs/pnfs.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/pnfs.h | 28 +++++++++++++++
include/linux/nfs_fs_sb.h | 1 +
7 files changed, 150 insertions(+), 5 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 17bfc25..a5cf28a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -202,10 +202,21 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+ rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
#else
static void nfs4_shutdown_client(struct nfs_client *clp)
{
}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
#endif /* CONFIG_NFS_V4 */
/*
@@ -973,6 +984,8 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ pnfs_init_server(server);
+
return server;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index d927251..e0c3cea 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,7 +242,7 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3b059d3..b234642 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1838,6 +1838,8 @@ struct nfs4_closedata {
struct nfs_closeres res;
struct nfs_fattr fattr;
unsigned long timestamp;
+ bool roc;
+ u32 roc_barrier;
};
static void nfs4_free_closedata(void *data)
@@ -1845,6 +1847,8 @@ static void nfs4_free_closedata(void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state_owner *sp = calldata->state->owner;
+ if (calldata->roc)
+ pnfs_roc_release(calldata->state->inode);
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
@@ -1877,6 +1881,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
*/
switch (task->tk_status) {
case 0:
+ if (calldata->roc)
+ pnfs_roc_set_barrier(state->inode,
+ calldata->roc_barrier);
nfs_set_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
nfs4_close_clear_stateid_flags(state,
@@ -1929,8 +1936,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
return;
}
- if (calldata->arg.fmode == 0)
+ if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+ if (calldata->roc &&
+ pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
+ rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
+ task, NULL);
+ return;
+ }
+ }
nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
@@ -1958,7 +1972,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
*
* NOTE: Caller must be holding the sp->so_owner semaphore!
*/
-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_closedata *calldata;
@@ -1993,6 +2007,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
+ calldata->roc = roc;
path_get(path);
calldata->path = *path;
@@ -2010,6 +2025,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
out_free_calldata:
kfree(calldata);
out:
+ if (roc)
+ pnfs_roc_release(state->inode);
nfs4_put_open_state(state);
nfs4_put_state_owner(sp);
return status;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index fb23a32..a472b7c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -607,8 +607,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
if (!call_close) {
nfs4_put_open_state(state);
nfs4_put_state_owner(owner);
- } else
- nfs4_do_close(path, state, gfp_mask, wait);
+ } else {
+ bool roc = pnfs_roc(state->inode);
+
+ nfs4_do_close(path, state, gfp_mask, wait, roc);
+ }
}
void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 83f92fe..dd45249 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -255,6 +255,7 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
spin_unlock(&clp->cl_lock);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
}
+ rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
list_add(&lseg->pls_list, tmp_list);
}
}
@@ -466,6 +467,83 @@ send_layoutget(struct pnfs_layout_hdr *lo,
return lseg;
}
+bool pnfs_roc(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg, *tmp;
+ LIST_HEAD(tmp_list);
+ bool found = false;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ goto out_nolayout;
+ list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+ if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ mark_lseg_invalid(lseg, &tmp_list);
+ found = true;
+ }
+ if (!found)
+ goto out_nolayout;
+ lo->plh_block_lgets++;
+ get_layout_hdr(lo); /* matched in pnfs_roc_release */
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+ return true;
+
+out_nolayout:
+ spin_unlock(&ino->i_lock);
+ return false;
+}
+
+void pnfs_roc_release(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ lo->plh_block_lgets--;
+ put_layout_hdr_locked(lo);
+ spin_unlock(&ino->i_lock);
+}
+
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if ((int)(barrier - lo->plh_barrier) > 0)
+ lo->plh_barrier = barrier;
+ spin_unlock(&ino->i_lock);
+}
+
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_segment *lseg;
+ bool found = false;
+
+ spin_lock(&ino->i_lock);
+ list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+ if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ found = true;
+ break;
+ }
+ if (!found) {
+ struct pnfs_layout_hdr *lo = nfsi->layout;
+ u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+
+ /* Since close does not return a layout stateid for use as
+ * a barrier, we choose the worst-case barrier.
+ */
+ *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+ }
+ spin_unlock(&ino->i_lock);
+ return found;
+}
+
/*
* Compare two layout segments for sorting into layout cache.
* We want to preferentially return RW over RO layouts, so ensure those
@@ -724,6 +802,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
+ if (res->return_on_close) {
+ set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+ }
+
/* Done processing layoutget. Set the layout stateid */
pnfs_set_layout_stateid(lo, &res->stateid, false);
spin_unlock(&ino->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9c81d82..ab4c011 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+ NFS_LSEG_ROC, /* roc bit received from server */
};
struct pnfs_layout_segment {
@@ -50,6 +51,7 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
+ NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
@@ -163,6 +165,10 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
bool mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
u32 iomode);
+bool pnfs_roc(struct inode *ino);
+void pnfs_roc_release(struct inode *ino);
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
static inline int lo_fail_bit(u32 iomode)
@@ -194,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
return NULL;
}
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+ return false;
+}
+
+static inline void
+pnfs_roc_release(struct inode *ino)
+{
+}
+
+static inline void
+pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+}
+
+static inline bool
+pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+ return false;
+}
+
static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
{
}
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e93ada0..7f20c0b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -149,6 +149,7 @@ struct nfs_server {
that are supported on this
filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
+ struct rpc_wait_queue roc_rpcwaitq;
#endif
void (*destroy)(struct nfs_server *);
--
1.7.2.1
Since this list will be used to search for layouts to recall,
this is necessary to avoid a race where the recall comes in,
sees there is nothing in the client list, and prepares to return
NOMATCHING, while the LAYOUTGET gets processed before the recall
updates the stateid.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 27 +++++++++++++++++++--------
1 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 60fbd41..b11b46a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -460,14 +460,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->plh_inode->i_lock);
- if (list_empty(&lo->plh_segs)) {
- struct nfs_client *clp = NFS_SERVER(lo->plh_inode)->nfs_client;
-
- spin_lock(&clp->cl_lock);
- BUG_ON(!list_empty(&lo->plh_layouts));
- list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
- spin_unlock(&clp->cl_lock);
- }
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
@@ -588,6 +580,7 @@ pnfs_update_layout(struct inode *ino,
enum pnfs_iomode iomode)
{
struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
@@ -617,9 +610,27 @@ pnfs_update_layout(struct inode *ino,
atomic_inc(&lo->plh_outstanding);
get_layout_hdr_locked(lo);
+ if (list_empty(&lo->plh_segs)) {
+ /* The lo must be on the clp list if there is any
+ * chance of a CB_LAYOUTRECALL(FILE) coming in.
+ */
+ spin_lock(&clp->cl_lock);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
+ if (!lseg) {
+ spin_lock(&ino->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ spin_unlock(&ino->i_lock);
+ }
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(ino);
out:
--
1.7.2.1
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 3 +--
include/linux/nfs_xdr.h | 1 -
2 files changed, 1 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fcf2658..f475516 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5330,7 +5330,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
return;
}
}
- lgp->status = task->tk_status;
dprintk("<-- %s\n", __func__);
}
@@ -5386,7 +5385,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
status = nfs4_wait_for_completion_rpc_task(task);
if (status != 0)
goto out;
- status = lgp->status;
+ status = task->tk_status;
if (status != 0)
goto out;
status = pnfs_layout_process(lgp);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d325def..f923831 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -223,7 +223,6 @@ struct nfs4_layoutget {
struct nfs4_layoutget_args args;
struct nfs4_layoutget_res res;
struct pnfs_layout_segment **lsegpp;
- int status;
};
struct nfs4_getdeviceinfo_args {
--
1.7.2.1
On Dec 22, 2010, at 5:00 PM, Trond Myklebust wrote:
> On Tue, 2010-12-21 at 23:00 -0500, Fred Isaman wrote:
>> A lsyout can request return-on-close. How this interacts with the
>> forgetful model of never sending LAYOUTRETURNS is a bit ambiguous.
>> We forget any layouts marked roc, and wait for them to be completely
>> forgotten before continuing with the close. In addition, to compensate
>> for races with any inflight LAYOUTGETs, and the fact that we do not get
>> any layout stateid back from the server, we set the barrier to the worst
>> case scenario of current_seqid + number of outstanding LAYOUTGETS.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/inode.c | 1 +
>> fs/nfs/nfs4_fs.h | 2 +-
>> fs/nfs/nfs4proc.c | 21 +++++++++++-
>> fs/nfs/nfs4state.c | 7 +++-
>> fs/nfs/pnfs.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/nfs/pnfs.h | 28 ++++++++++++++++
>> include/linux/nfs_fs.h | 1 +
>> 7 files changed, 138 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>> index 43a69da..c64bb40 100644
>> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
>> index 29d504d..90515de 100644
>> --- a/include/linux/nfs_fs.h
>> +++ b/include/linux/nfs_fs.h
>> @@ -190,6 +190,7 @@ struct nfs_inode {
>> struct rw_semaphore rwsem;
>>
>> /* pNFS layout information */
>> + struct rpc_wait_queue lo_rpcwaitq;
>> struct pnfs_layout_hdr *layout;
>> #endif /* CONFIG_NFS_V4*/
>> #ifdef CONFIG_NFS_FSCACHE
>
> I believe that I've asked this before. Why do we need a per-inode
> rpc_wait_queue just to support pnfs? That's a significant expansion of
> an already bloated structure.
>
> Can we please either make this a single per-filesystem wait queue, or
> else possibly a pool of wait queues?
>
> Trond
This was introduced to avoid deadlocks that were occurring when we had a single wait queue. However, the deadlocks I remember were due to a combination of the fact that, at the time, we handled EAGAIN errors of IO outside the RPC code, and we sent LAYOUTRETURN on such error. Since we do neither now, I believe a single per-filesystem wait queue will suffice. Anyone disagree?
Fred
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> [email protected]
> http://www.netapp.com
>
This is the xdr decoding for CB_LAYOUTRECALL.
Signed-off-by: Alexandros Batsakis <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Marc Eshel <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 20 ++++++++++++++
fs/nfs/callback_proc.c | 6 ++++
fs/nfs/callback_xdr.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 94 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b678e3e..19570f4 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -138,6 +138,26 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
void *dummy,
struct cb_process_state *cps);
+struct cb_layoutrecallargs {
+ struct sockaddr *cbl_addr;
+ uint32_t cbl_recall_type;
+ uint32_t cbl_layout_type;
+ uint32_t cbl_layoutchanged;
+ union {
+ struct {
+ struct nfs_fh cbl_fh;
+ struct inode *cbl_inode;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid;
+ };
+};
+
+extern unsigned nfs4_callback_layoutrecall(
+ struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps);
+
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1bead2..c1bb157 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -107,6 +107,12 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ return cpu_to_be32(NFS4ERR_NOTSUPP); /* STUB */
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 9dbccf4..1523c99 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -24,6 +24,7 @@
#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -223,6 +224,66 @@ out:
#if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_layoutrecallargs *args)
+{
+ __be32 *p;
+ __be32 status = 0;
+ uint32_t iomode;
+
+ args->cbl_addr = svc_addr(rqstp);
+ p = read_buf(xdr, 4 * sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+
+ args->cbl_layout_type = ntohl(*p++);
+ /* Depite the spec's xdr, iomode really belongs in the FILE switch,
+ * as it is unuseable and ignored with the other types.
+ */
+ iomode = ntohl(*p++);
+ args->cbl_layoutchanged = ntohl(*p++);
+ args->cbl_recall_type = ntohl(*p++);
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ args->cbl_range.iomode = iomode;
+ status = decode_fh(xdr, &args->cbl_fh);
+ if (unlikely(status != 0))
+ goto out;
+
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_range.offset);
+ p = xdr_decode_hyper(p, &args->cbl_range.length);
+ status = decode_stateid(xdr, &args->cbl_stateid);
+ if (unlikely(status != 0))
+ goto out;
+ } else if (args->cbl_recall_type == RETURN_FSID) {
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+ p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+ } else if (args->cbl_recall_type != RETURN_ALL) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+ __func__,
+ args->cbl_layout_type, iomode,
+ args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -577,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_SEQUENCE:
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
+ case OP_CB_LAYOUTRECALL:
*op = &callback_ops[op_nr];
break;
- case OP_CB_LAYOUTRECALL:
case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
@@ -784,6 +845,12 @@ static struct callback_op callback_ops[] = {
.res_maxsize = CB_OP_RECALL_RES_MAXSZ,
},
#if defined(CONFIG_NFS_V4_1)
+ [OP_CB_LAYOUTRECALL] = {
+ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+ .decode_args =
+ (callback_decode_arg_t)decode_layoutrecall_args,
+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
--
1.7.2.1
While we are renaming all the fields, change lo->state to lo->plh_flags.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4filelayout.c | 2 +-
fs/nfs/pnfs.c | 66 +++++++++++++++++++++++-----------------------
fs/nfs/pnfs.h | 10 +++---
3 files changed, 39 insertions(+), 39 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d..738d6a4 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+ struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6e9daff..c3ca5fe 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -210,9 +210,9 @@ put_layout_hdr(struct inode *inode)
static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
- INIT_LIST_HEAD(&lseg->fi_list);
- kref_init(&lseg->kref);
- lseg->layout = lo;
+ INIT_LIST_HEAD(&lseg->pls_list);
+ kref_init(&lseg->pls_refcount);
+ lseg->pls_layout = lo;
}
/* Called without i_lock held, as the free_lseg call may sleep */
@@ -220,8 +220,8 @@ static void
destroy_lseg(struct kref *kref)
{
struct pnfs_layout_segment *lseg =
- container_of(kref, struct pnfs_layout_segment, kref);
- struct inode *ino = lseg->layout->inode;
+ container_of(kref, struct pnfs_layout_segment, pls_refcount);
+ struct inode *ino = lseg->pls_layout->inode;
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
@@ -236,8 +236,8 @@ put_lseg(struct pnfs_layout_segment *lseg)
return;
dprintk("%s: lseg %p ref %d\n", __func__, lseg,
- atomic_read(&lseg->kref.refcount));
- kref_put(&lseg->kref, destroy_lseg);
+ atomic_read(&lseg->pls_refcount.refcount));
+ kref_put(&lseg->pls_refcount, destroy_lseg);
}
static void
@@ -249,9 +249,9 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
dprintk("%s:Begin lo %p\n", __func__, lo);
assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+ list_for_each_entry_safe(lseg, next, &lo->segs, pls_list) {
dprintk("%s: freeing lseg %p\n", __func__, lseg);
- list_move(&lseg->fi_list, tmp_list);
+ list_move(&lseg->pls_list, tmp_list);
}
clp = NFS_SERVER(lo->inode)->nfs_client;
spin_lock(&clp->cl_lock);
@@ -259,7 +259,7 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
list_del_init(&lo->layouts);
spin_unlock(&clp->cl_lock);
write_seqlock(&lo->seqlock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
write_sequnlock(&lo->seqlock);
dprintk("%s:Return\n", __func__);
@@ -272,9 +272,9 @@ pnfs_free_lseg_list(struct list_head *tmp_list)
while (!list_empty(tmp_list)) {
lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
- fi_list);
+ pls_list);
dprintk("%s calling put_lseg on %p\n", __func__, lseg);
- list_del(&lseg->fi_list);
+ list_del(&lseg->pls_list);
put_lseg(lseg);
}
}
@@ -331,7 +331,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool overwrite = false;
write_seqlock(&lo->seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
else {
@@ -360,7 +360,7 @@ pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
memcpy(lo->stateid.data, state->stateid.data,
sizeof(state->stateid.data));
} while (read_seqretry(&state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
write_sequnlock(&lo->seqlock);
dprintk("<-- %s\n", __func__);
}
@@ -374,7 +374,7 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
do {
seq = read_seqbegin(&lo->seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
/* This will trigger retry of the read */
pnfs_layout_from_open_stateid(lo, open_state);
} else
@@ -424,7 +424,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->state);
+ set_bit(lo_fail_bit(iomode), &lo->plh_flags);
}
return lseg;
}
@@ -459,26 +459,26 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
list_add_tail(&lo->layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- list_for_each_entry(lp, &lo->segs, fi_list) {
- if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+ list_for_each_entry(lp, &lo->segs, pls_list) {
+ if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
- list_add_tail(&lseg->fi_list, &lp->fi_list);
+ list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu before "
"lp %p iomode %d offset %llu length %llu\n",
- __func__, lseg, lseg->range.iomode,
- lseg->range.offset, lseg->range.length,
- lp, lp->range.iomode, lp->range.offset,
- lp->range.length);
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length,
+ lp, lp->pls_range.iomode, lp->pls_range.offset,
+ lp->pls_range.length);
found = 1;
break;
}
if (!found) {
- list_add_tail(&lseg->fi_list, &lo->segs);
+ list_add_tail(&lseg->pls_list, &lo->segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
- __func__, lseg, lseg->range.iomode,
- lseg->range.offset, lseg->range.length);
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
}
get_layout_hdr_locked(lo);
@@ -538,7 +538,7 @@ pnfs_find_alloc_layout(struct inode *ino)
static int
is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
{
- return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+ return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
}
/*
@@ -552,17 +552,17 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry(lseg, &lo->segs, fi_list) {
+ list_for_each_entry(lseg, &lo->segs, pls_list) {
if (is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
}
- if (cmp_layout(iomode, lseg->range.iomode) > 0)
+ if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
break;
}
dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount.refcount) : 0);
return ret;
}
@@ -597,7 +597,7 @@ pnfs_update_layout(struct inode *ino,
}
/* if LAYOUTGET already failed once we don't try again */
- if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
@@ -606,7 +606,7 @@ pnfs_update_layout(struct inode *ino,
lseg = send_layoutget(lo, ctx, iomode);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->state, lseg);
+ nfsi->layout->plh_flags, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -636,7 +636,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
spin_lock(&ino->i_lock);
init_lseg(lo, lseg);
- lseg->range = res->range;
+ lseg->pls_range = res->range;
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d..6fcc073 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -31,10 +31,10 @@
#define FS_NFS_PNFS_H
struct pnfs_layout_segment {
- struct list_head fi_list;
- struct pnfs_layout_range range;
- struct kref kref;
- struct pnfs_layout_hdr *layout;
+ struct list_head pls_list;
+ struct pnfs_layout_range pls_range;
+ struct kref pls_refcount;
+ struct pnfs_layout_hdr *pls_layout;
};
#ifdef CONFIG_NFS_V4_1
@@ -65,7 +65,7 @@ struct pnfs_layout_hdr {
struct list_head segs; /* layout segments list */
seqlock_t seqlock; /* Protects the stateid */
nfs4_stateid stateid;
- unsigned long state;
+ unsigned long plh_flags;
struct inode *inode;
};
--
1.7.2.1
This is to prepare the way for sensible io draining. Instead of just
removing the lseg from the list, we instead clear the VALID flag
(preventing new io from grabbing references to the lseg) and remove
the reference holding it in the list. Thus the lseg will be removed
once any io in progress completes and any references still held are
dropped.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/inode.c | 2 +-
fs/nfs/pnfs.c | 117 +++++++++++++++++++++++++++++++++++++-------------------
fs/nfs/pnfs.h | 8 +++-
3 files changed, 85 insertions(+), 42 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e67e31c..43a69da 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
+ pnfs_destroy_layout(NFS_I(inode));
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
- pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 08313f53..bbca2e5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -211,68 +211,100 @@ static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
INIT_LIST_HEAD(&lseg->pls_list);
- kref_init(&lseg->pls_refcount);
+ atomic_set(&lseg->pls_refcount, 1);
+ smp_mb();
+ set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
}
-/* Called without i_lock held, as the free_lseg call may sleep */
-static void
-destroy_lseg(struct kref *kref)
+static void free_lseg(struct pnfs_layout_segment *lseg)
{
- struct pnfs_layout_segment *lseg =
- container_of(kref, struct pnfs_layout_segment, pls_refcount);
struct inode *ino = lseg->pls_layout->plh_inode;
- dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr in pnfs_insert_layout */
put_layout_hdr(ino);
}
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
+ * could sleep, so must be called outside of the lock.
+ */
static void
-put_lseg(struct pnfs_layout_segment *lseg)
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
{
- if (!lseg)
- return;
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ struct inode *ino = lseg->pls_layout->plh_inode;
- dprintk("%s: lseg %p ref %d\n", __func__, lseg,
- atomic_read(&lseg->pls_refcount.refcount));
- kref_put(&lseg->pls_refcount, destroy_lseg);
+ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del(&lseg->pls_list);
+ if (list_empty(&lseg->pls_layout->plh_segs)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(ino)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ /* List does not take a reference, so no need for put here */
+ list_del_init(&lseg->pls_layout->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ list_add(&lseg->pls_list, tmp_list);
+ }
}
-static void
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+static bool
+should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+{
+ return (recall_iomode == IOMODE_ANY ||
+ lseg_iomode == recall_iomode);
+}
+
+static void mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ /* Remove the reference keeping the lseg in the
+ * list. It will now be removed when all
+ * outstanding io is finished.
+ */
+ put_lseg_locked(lseg, tmp_list);
+ }
+}
+
+/* Returns false if no lsegs match, true otherwise */
+static bool
+mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ u32 iomode)
{
struct pnfs_layout_segment *lseg, *next;
- struct nfs_client *clp;
+ bool rv = false;
dprintk("%s:Begin lo %p\n", __func__, lo);
- assert_spin_locked(&lo->plh_inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
- dprintk("%s: freeing lseg %p\n", __func__, lseg);
- list_move(&lseg->pls_list, tmp_list);
- }
- clp = NFS_SERVER(lo->plh_inode)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lo->plh_layouts);
- spin_unlock(&clp->cl_lock);
-
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ dprintk("%s: freeing lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+ lseg->pls_range.length);
+ mark_lseg_invalid(lseg, tmp_list);
+ rv = true;
+ }
dprintk("%s:Return\n", __func__);
+ return rv;
}
static void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+pnfs_free_lseg_list(struct list_head *free_me)
{
- struct pnfs_layout_segment *lseg;
+ struct pnfs_layout_segment *lseg, *tmp;
- while (!list_empty(tmp_list)) {
- lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
- pls_list);
- dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+ list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
list_del(&lseg->pls_list);
- put_lseg(lseg);
+ free_lseg(lseg);
}
}
@@ -285,7 +317,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- pnfs_clear_lseg_list(lo, &tmp_list);
+ set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+ mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
/* Matched by refcount set to 1 in alloc_init_layout_hdr */
put_layout_hdr_locked(lo);
}
@@ -477,9 +510,12 @@ pnfs_find_alloc_layout(struct inode *ino)
dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
assert_spin_locked(&ino->i_lock);
- if (nfsi->layout)
- return nfsi->layout;
-
+ if (nfsi->layout) {
+ if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+ return NULL;
+ else
+ return nfsi->layout;
+ }
spin_unlock(&ino->i_lock);
new = alloc_init_layout_hdr(ino);
spin_lock(&ino->i_lock);
@@ -520,7 +556,8 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
- if (is_matching_lseg(lseg, iomode)) {
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+ is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
}
@@ -529,7 +566,7 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
}
dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->pls_refcount.refcount) : 0);
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
return ret;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1093720..787253e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,10 +30,15 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+enum {
+ NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+};
+
struct pnfs_layout_segment {
struct list_head pls_list;
struct pnfs_layout_range pls_range;
- struct kref pls_refcount;
+ atomic_t pls_refcount;
+ unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
};
@@ -44,6 +49,7 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
/* Per-layout driver specific registration structure */
--
1.7.2.1
From: Alexandros Batsakis <[email protected]>
While here, update the code a bit.
Signed-off-by: Alexandros Batsakis <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 8 ++++++++
fs/nfs/callback_proc.c | 29 ++++++++++++++++++++++++++---
2 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 19570f4..70e6435d 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
+#define RCA4_TYPE_MASK_DIR_DLG 2
+#define RCA4_TYPE_MASK_FILE_LAYOUT 3
+#define RCA4_TYPE_MASK_BLK_LAYOUT 4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
struct sockaddr *craa_addr;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a8e8e20..d29c769 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -230,6 +230,17 @@ __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
return cpu_to_be32(res);
}
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL;
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
@@ -421,29 +432,41 @@ out:
return status;
}
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+ return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+
__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
struct cb_process_state *cps)
{
__be32 status;
fmode_t flags = 0;
- status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
if (!cps->clp) /* set in cb_sequence */
goto out;
dprintk("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ status = cpu_to_be32(NFS4ERR_INVAL);
+ if (!validate_bitmap_values(args->craa_type_mask))
+ goto out;
+
+ status = cpu_to_be32(NFS4_OK);
if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
&args->craa_type_mask))
flags = FMODE_READ;
if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
&args->craa_type_mask))
flags |= FMODE_WRITE;
-
+ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+ &args->craa_type_mask))
+ pnfs_recall_all_layouts(cps->clp);
if (flags)
nfs_expire_all_delegation_types(cps->clp, flags);
- status = htonl(NFS4_OK);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
--
1.7.2.1
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4filelayout.c | 6 +-
fs/nfs/pnfs.c | 94 +++++++++++++++++++++++-----------------------
fs/nfs/pnfs.h | 12 +++---
3 files changed, 56 insertions(+), 56 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 738d6a4..23f930c 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
{
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
- struct nfs_server *nfss = NFS_SERVER(lo->inode);
+ struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
dprintk("--> %s\n", __func__);
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
/* find and reference the deviceid */
dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
if (dsaddr == NULL) {
- dsaddr = get_device_info(lo->inode, id);
+ dsaddr = get_device_info(lo->plh_inode, id);
if (dsaddr == NULL)
goto out;
}
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->inode);
+ struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c3ca5fe..6736f9e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -180,21 +180,21 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
static void
get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->inode->i_lock);
- lo->refcount++;
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ lo->plh_refcount++;
}
static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->inode->i_lock);
- BUG_ON(lo->refcount == 0);
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ BUG_ON(lo->plh_refcount == 0);
- lo->refcount--;
- if (!lo->refcount) {
+ lo->plh_refcount--;
+ if (!lo->plh_refcount) {
dprintk("%s: freeing layout cache %p\n", __func__, lo);
- BUG_ON(!list_empty(&lo->layouts));
- NFS_I(lo->inode)->layout = NULL;
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ NFS_I(lo->plh_inode)->layout = NULL;
kfree(lo);
}
}
@@ -221,7 +221,7 @@ destroy_lseg(struct kref *kref)
{
struct pnfs_layout_segment *lseg =
container_of(kref, struct pnfs_layout_segment, pls_refcount);
- struct inode *ino = lseg->pls_layout->inode;
+ struct inode *ino = lseg->pls_layout->plh_inode;
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
@@ -248,19 +248,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
dprintk("%s:Begin lo %p\n", __func__, lo);
- assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->segs, pls_list) {
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
dprintk("%s: freeing lseg %p\n", __func__, lseg);
list_move(&lseg->pls_list, tmp_list);
}
- clp = NFS_SERVER(lo->inode)->nfs_client;
+ clp = NFS_SERVER(lo->plh_inode)->nfs_client;
spin_lock(&clp->cl_lock);
/* List does not take a reference, so no need for put here */
- list_del_init(&lo->layouts);
+ list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
dprintk("%s:Return\n", __func__);
}
@@ -312,25 +312,25 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
while (!list_empty(&tmp_list)) {
lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
- layouts);
+ plh_layouts);
dprintk("%s freeing layout for inode %lu\n", __func__,
- lo->inode->i_ino);
- pnfs_destroy_layout(NFS_I(lo->inode));
+ lo->plh_inode->i_ino);
+ pnfs_destroy_layout(NFS_I(lo->plh_inode));
}
}
-/* update lo->stateid with new if is more recent
+/* update lo->plh_stateid with new if is more recent
*
- * lo->stateid could be the open stateid, in which case we just use what given.
+ * lo->plh_stateid could be the open stateid, in which case we just use what given.
*/
static void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
- nfs4_stateid *old = &lo->stateid;
+ nfs4_stateid *old = &lo->plh_stateid;
bool overwrite = false;
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
@@ -344,7 +344,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
}
if (overwrite)
memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
}
static void
@@ -354,14 +354,14 @@ pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
int seq;
dprintk("--> %s\n", __func__);
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
do {
seq = read_seqbegin(&state->seqlock);
- memcpy(lo->stateid.data, state->stateid.data,
+ memcpy(lo->plh_stateid.data, state->stateid.data,
sizeof(state->stateid.data));
} while (read_seqretry(&state->seqlock, seq));
set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
dprintk("<-- %s\n", __func__);
}
@@ -373,14 +373,14 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
do {
- seq = read_seqbegin(&lo->seqlock);
+ seq = read_seqbegin(&lo->plh_seqlock);
if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
/* This will trigger retry of the read */
pnfs_layout_from_open_stateid(lo, open_state);
} else
- memcpy(dst->data, lo->stateid.data,
- sizeof(lo->stateid.data));
- } while (read_seqretry(&lo->seqlock, seq));
+ memcpy(dst->data, lo->plh_stateid.data,
+ sizeof(lo->plh_stateid.data));
+ } while (read_seqretry(&lo->plh_seqlock, seq));
dprintk("<-- %s\n", __func__);
}
@@ -395,7 +395,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
u32 iomode)
{
- struct inode *ino = lo->inode;
+ struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
struct pnfs_layout_segment *lseg = NULL;
@@ -405,7 +405,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
BUG_ON(ctx == NULL);
lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
if (lgp == NULL) {
- put_layout_hdr(lo->inode);
+ put_layout_hdr(lo->plh_inode);
return NULL;
}
lgp->args.minlength = NFS4_MAX_UINT64;
@@ -450,16 +450,16 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
- assert_spin_locked(&lo->inode->i_lock);
- if (list_empty(&lo->segs)) {
- struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ struct nfs_client *clp = NFS_SERVER(lo->plh_inode)->nfs_client;
spin_lock(&clp->cl_lock);
- BUG_ON(!list_empty(&lo->layouts));
- list_add_tail(&lo->layouts, &clp->cl_layouts);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- list_for_each_entry(lp, &lo->segs, pls_list) {
+ list_for_each_entry(lp, &lo->plh_segs, pls_list) {
if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
@@ -474,7 +474,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
break;
}
if (!found) {
- list_add_tail(&lseg->pls_list, &lo->segs);
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
__func__, lseg, lseg->pls_range.iomode,
@@ -493,11 +493,11 @@ alloc_init_layout_hdr(struct inode *ino)
lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
if (!lo)
return NULL;
- lo->refcount = 1;
- INIT_LIST_HEAD(&lo->layouts);
- INIT_LIST_HEAD(&lo->segs);
- seqlock_init(&lo->seqlock);
- lo->inode = ino;
+ lo->plh_refcount = 1;
+ INIT_LIST_HEAD(&lo->plh_layouts);
+ INIT_LIST_HEAD(&lo->plh_segs);
+ seqlock_init(&lo->plh_seqlock);
+ lo->plh_inode = ino;
return lo;
}
@@ -551,8 +551,8 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
dprintk("%s:Begin\n", __func__);
- assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry(lseg, &lo->segs, pls_list) {
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
@@ -619,7 +619,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
- struct inode *ino = lo->inode;
+ struct inode *ino = lo->plh_inode;
int status = 0;
/* Inject layout blob into I/O device driver */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 6fcc073..c2f1086 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -60,13 +60,13 @@ struct pnfs_layoutdriver_type {
};
struct pnfs_layout_hdr {
- unsigned long refcount;
- struct list_head layouts; /* other client layouts */
- struct list_head segs; /* layout segments list */
- seqlock_t seqlock; /* Protects the stateid */
- nfs4_stateid stateid;
+ unsigned long plh_refcount;
+ struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_segs; /* layout segments list */
+ seqlock_t plh_seqlock; /* Protects the stateid */
+ nfs4_stateid plh_stateid;
unsigned long plh_flags;
- struct inode *inode;
+ struct inode *plh_inode;
};
struct pnfs_device {
--
1.7.2.1
This will be required to allow us to grab reference outside of i_lock.
While we are at it, make put_layout_hdr take the same argument as all the
related functions.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 48 ++++++++++++++++++++++++++----------------------
fs/nfs/pnfs.h | 4 ++--
2 files changed, 28 insertions(+), 24 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 18f9ddc..5a2ca98 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,34 +177,38 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
* pNFS client layout cache
*/
+/* Need to hold i_lock if caller does not already hold reference */
static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+get_layout_hdr(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->plh_inode->i_lock);
- lo->plh_refcount++;
+ atomic_inc(&lo->plh_refcount);
}
static void
-put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->plh_inode->i_lock);
- BUG_ON(lo->plh_refcount == 0);
+ dprintk("%s: freeing layout cache %p\n", __func__, lo);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ NFS_I(lo->plh_inode)->layout = NULL;
+ kfree(lo);
+}
- lo->plh_refcount--;
- if (!lo->plh_refcount) {
- dprintk("%s: freeing layout cache %p\n", __func__, lo);
- BUG_ON(!list_empty(&lo->plh_layouts));
- NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
- }
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+ if (atomic_dec_and_test(&lo->plh_refcount))
+ destroy_layout_hdr(lo);
}
void
-put_layout_hdr(struct inode *inode)
+put_layout_hdr(struct pnfs_layout_hdr *lo)
{
- spin_lock(&inode->i_lock);
- put_layout_hdr_locked(NFS_I(inode)->layout);
- spin_unlock(&inode->i_lock);
+ struct inode *inode = lo->plh_inode;
+
+ if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ destroy_layout_hdr(lo);
+ spin_unlock(&inode->i_lock);
+ }
}
static void
@@ -223,7 +227,7 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr in pnfs_insert_layout */
- put_layout_hdr(ino);
+ put_layout_hdr(NFS_I(ino)->layout);
}
/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
@@ -481,7 +485,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
__func__, lseg, lseg->pls_range.iomode,
lseg->pls_range.offset, lseg->pls_range.length);
}
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);
dprintk("%s:Return\n", __func__);
}
@@ -494,7 +498,7 @@ alloc_init_layout_hdr(struct inode *ino)
lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
if (!lo)
return NULL;
- lo->plh_refcount = 1;
+ atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
lo->plh_inode = ino;
@@ -609,7 +613,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);
if (list_empty(&lo->plh_segs)) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
@@ -632,7 +636,7 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&ino->i_lock);
}
atomic_dec(&lo->plh_outstanding);
- put_layout_hdr(ino);
+ put_layout_hdr(lo);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout->plh_flags, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 698380d..8aaab56 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,7 +65,7 @@ struct pnfs_layoutdriver_type {
};
struct pnfs_layout_hdr {
- unsigned long plh_refcount;
+ atomic_t plh_refcount;
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
@@ -147,7 +147,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct inode *inode);
+void put_layout_hdr(struct pnfs_layout_hdr *lo);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);
--
1.7.2.1
On Dec 26, 2010, at 3:40 AM, Benny Halevy wrote:
> On 2010-12-23 02:19, Fred Isaman wrote:
>>
>> On Dec 22, 2010, at 5:00 PM, Trond Myklebust wrote:
>>
>>> On Tue, 2010-12-21 at 23:00 -0500, Fred Isaman wrote:
>>>> A lsyout can request return-on-close. How this interacts with the
>>>> forgetful model of never sending LAYOUTRETURNS is a bit ambiguous.
>>>> We forget any layouts marked roc, and wait for them to be completely
>>>> forgotten before continuing with the close. In addition, to compensate
>>>> for races with any inflight LAYOUTGETs, and the fact that we do not get
>>>> any layout stateid back from the server, we set the barrier to the worst
>>>> case scenario of current_seqid + number of outstanding LAYOUTGETS.
>>>>
>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>> ---
>>>> fs/nfs/inode.c | 1 +
>>>> fs/nfs/nfs4_fs.h | 2 +-
>>>> fs/nfs/nfs4proc.c | 21 +++++++++++-
>>>> fs/nfs/nfs4state.c | 7 +++-
>>>> fs/nfs/pnfs.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>> fs/nfs/pnfs.h | 28 ++++++++++++++++
>>>> include/linux/nfs_fs.h | 1 +
>>>> 7 files changed, 138 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>>>> index 43a69da..c64bb40 100644
>>>> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
>>>> index 29d504d..90515de 100644
>>>> --- a/include/linux/nfs_fs.h
>>>> +++ b/include/linux/nfs_fs.h
>>>> @@ -190,6 +190,7 @@ struct nfs_inode {
>>>> struct rw_semaphore rwsem;
>>>>
>>>> /* pNFS layout information */
>>>> + struct rpc_wait_queue lo_rpcwaitq;
>>>> struct pnfs_layout_hdr *layout;
>>>> #endif /* CONFIG_NFS_V4*/
>>>> #ifdef CONFIG_NFS_FSCACHE
>>>
>>> I believe that I've asked this before. Why do we need a per-inode
>>> rpc_wait_queue just to support pnfs? That's a significant expansion of
>>> an already bloated structure.
>>>
>>> Can we please either make this a single per-filesystem wait queue, or
>>> else possibly a pool of wait queues?
>>>
>>> Trond
>>
>> This was introduced to avoid deadlocks that were occurring when we had a single wait queue. However, the deadlocks I remember were due to a combination of the fact that, at the time, we handled EAGAIN errors of IO outside the RPC code, and we sent LAYOUTRETURN on such error. Since we do neither now, I believe a single per-filesystem wait queue will suffice. Anyone disagree?
>
> The dead locks were also because we didn't use rpc wait queue but rather a thread based one.
> Doing the serialization in the rpc prepare phase using a shared queue should cause dead locks.
>
> Benny
>
In the revised code, we use a per-fs rpc waitq to wait for IO to drain before sending CLOSE to the MDS. For a deadlock to occur, we would have to have an IO thread get stuck waiting for the CLOSE to complete. At least for the file layout driver, I don't see anyplace that is likely to happen.
Fred
>>
>> Fred
>>
>>>
>>> --
>>> Trond Myklebust
>>> Linux NFS client maintainer
>>>
>>> NetApp
>>> [email protected]
>>> http://www.netapp.com
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
This prepares for future changes, where the layout state needs
to change atomically with several other variables. In particular,
it will need to know if lo->segs is empty, as we test that instead
of manipulating the NFS_LAYOUT_STATEID_SET bit. Moreover, the
layoutstateid is not really a read-mostly structure, as it is
written almost as often as it is read.
The behavior of pnfs_get_layout_stateid is also slightly changed, so that
it no longer changes the stateid. Its name is changed to +pnfs_choose_layoutget_stateid.
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4xdr.c | 2 +-
fs/nfs/pnfs.c | 79 +++++++++++++++--------------------------------------
fs/nfs/pnfs.h | 7 ++---
3 files changed, 27 insertions(+), 61 deletions(-)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f3f9915..4e28242 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1798,7 +1798,7 @@ encode_layoutget(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+ pnfs_choose_layoutget_stateid(&stateid, NFS_I(args->inode)->layout,
args->ctx->state);
p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
*p = cpu_to_be32(args->maxcount);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6736f9e..08313f53 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -258,9 +258,6 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
/* List does not take a reference, so no need for put here */
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
- write_seqlock(&lo->plh_seqlock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->plh_seqlock);
dprintk("%s:Return\n", __func__);
}
@@ -319,69 +316,40 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
}
}
-/* update lo->plh_stateid with new if is more recent
- *
- * lo->plh_stateid could be the open stateid, in which case we just use what given.
- */
+/* update lo->plh_stateid with new if is more recent */
static void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
- nfs4_stateid *old = &lo->plh_stateid;
- bool overwrite = false;
-
- write_seqlock(&lo->plh_seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
- memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
- overwrite = true;
- else {
- u32 oldseq, newseq;
-
- oldseq = be32_to_cpu(old->stateid.seqid);
- newseq = be32_to_cpu(new->stateid.seqid);
- if ((int)(newseq - oldseq) > 0)
- overwrite = true;
- }
- if (overwrite)
- memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
- write_sequnlock(&lo->plh_seqlock);
-}
-
-static void
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
- struct nfs4_state *state)
-{
- int seq;
+ u32 oldseq, newseq;
- dprintk("--> %s\n", __func__);
- write_seqlock(&lo->plh_seqlock);
- do {
- seq = read_seqbegin(&state->seqlock);
- memcpy(lo->plh_stateid.data, state->stateid.data,
- sizeof(state->stateid.data));
- } while (read_seqretry(&state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->plh_seqlock);
- dprintk("<-- %s\n", __func__);
+ oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+ newseq = be32_to_cpu(new->stateid.seqid);
+ if ((int)(newseq - oldseq) > 0)
+ memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
}
-void
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state)
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state)
{
- int seq;
+ int status = 0;
dprintk("--> %s\n", __func__);
- do {
- seq = read_seqbegin(&lo->plh_seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
- /* This will trigger retry of the read */
- pnfs_layout_from_open_stateid(lo, open_state);
- } else
- memcpy(dst->data, lo->plh_stateid.data,
- sizeof(lo->plh_stateid.data));
- } while (read_seqretry(&lo->plh_seqlock, seq));
+ spin_lock(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ int seq;
+
+ do {
+ seq = read_seqbegin(&open_state->seqlock);
+ memcpy(dst->data, open_state->stateid.data,
+ sizeof(open_state->stateid.data));
+ } while (read_seqretry(&open_state->seqlock, seq));
+ } else
+ memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+ spin_unlock(&lo->plh_inode->i_lock);
dprintk("<-- %s\n", __func__);
+ return status;
}
/*
@@ -496,7 +464,6 @@ alloc_init_layout_hdr(struct inode *ino)
lo->plh_refcount = 1;
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
- seqlock_init(&lo->plh_seqlock);
lo->plh_inode = ino;
return lo;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c2f1086..1093720 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -44,7 +44,6 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
- NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
};
/* Per-layout driver specific registration structure */
@@ -63,7 +62,6 @@ struct pnfs_layout_hdr {
unsigned long plh_refcount;
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
- seqlock_t plh_seqlock; /* Protects the stateid */
nfs4_stateid plh_stateid;
unsigned long plh_flags;
struct inode *plh_inode;
@@ -143,8 +141,9 @@ int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct inode *inode);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state);
static inline int lo_fail_bit(u32 iomode)
--
1.7.2.1
Either a bad server reply, or our ignoring of multiple array segments in
a reply, can cause a reply to not meet our requirements. Ensure
that we ignore such replies.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 11 +++++++++++
1 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b11b46a..18f9ddc 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -651,6 +651,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct inode *ino = lo->plh_inode;
int status = 0;
+ /* Verify we got what we asked for.
+ * Note that because the xdr parsing only accepts a single
+ * element array, this can fail even if the server is behaving
+ * correctly.
+ */
+ if (lgp->args.range.iomode > res->range.iomode ||
+ res->range.offset != 0 ||
+ res->range.length != NFS4_MAX_UINT64) {
+ status = -EINVAL;
+ goto out;
+ }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
if (!lseg || IS_ERR(lseg)) {
--
1.7.2.1