Version 5 of the wave 2 submission.
Changes from version 4:
- change return value of mark_matching_lsegs_invalid, so that we can...
- fix algorithm for choosing DELAY or NOMATCHING for CB_LAYOUTRECALL return
- grab inode where needed
- avoid crash if no layout found in list
- code cleanups suggested by Trond
- remove cbl_inode field, enabled by above cleanups
These patches implement wave 2 of the pnfs submission, which
encompasses CB_LAYOUTRECALL and its serialization with LAYOUTGET,
as well as the "forgetful model", in which the client never sends
LAYOUTRETURN and instead simply discards recalled layouts.
Fred
[PATCH 01/15] pnfs: fix incorrect comment in destroy_lseg
[PATCH 02/15] pnfs: remove unnecessary field lgp->status
[PATCH 03/15] pnfs: add prefix to struct pnfs_layout_segment fields
[PATCH 04/15] pnfs: add prefix to struct pnfs_layout_hdr fields
[PATCH 05/15] pnfs: change layout state seqlock to a spinlock
[PATCH 06/15] pnfs: change how lsegs are removed from layout list
[PATCH 07/15] pnfs: layoutget rpc code cleanup
[PATCH 08/15] pnfs: serialize LAYOUTGET(openstateid)
[PATCH 09/15] pnfs: add layout to client list before sending rpc
[PATCH 10/15] pnfs: check that partial LAYOUTGET return is ignored
[PATCH 11/15] pnfs: change lo refcounting to atomic_t
[PATCH 12/15] pnfs: CB_LAYOUTRECALL xdr code
[PATCH 13/15] pnfs: add CB_LAYOUTRECALL handling
[PATCH 14/15] pnfs: update nfs4_callback_recallany to handle layouts
[PATCH 15/15] pnfs: layout roc code
This prepares the way for sensible io draining. Instead of just
removing the lseg from the list, we clear the VALID flag
(preventing new io from grabbing references to the lseg) and drop
the reference that holds it in the list. Thus the lseg will be removed
once any io in progress completes and any references still held are
dropped.
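To illustrate the intended model (a minimal sketch, not part of this patch;
the helper name is hypothetical): an io path takes a new reference only while
NFS_LSEG_VALID is set, so once a recall clears the bit no new users can
appear and the final put frees the lseg.

static struct pnfs_layout_segment *
get_lseg_if_valid(struct pnfs_layout_segment *lseg)
{
	/* caller holds the inode's i_lock */
	if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
		return NULL;
	atomic_inc(&lseg->pls_refcount);
	return lseg;
}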
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/inode.c | 2 +-
fs/nfs/pnfs.c | 130 ++++++++++++++++++++++++++++++++++++++------------------
fs/nfs/pnfs.h | 8 +++-
3 files changed, 96 insertions(+), 44 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e67e31c..43a69da 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1410,9 +1410,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
+ pnfs_destroy_layout(NFS_I(inode));
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
- pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 08313f53..212cbc2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -211,68 +211,109 @@ static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
INIT_LIST_HEAD(&lseg->pls_list);
- kref_init(&lseg->pls_refcount);
+ atomic_set(&lseg->pls_refcount, 1);
+ smp_mb();
+ set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
}
-/* Called without i_lock held, as the free_lseg call may sleep */
-static void
-destroy_lseg(struct kref *kref)
+static void free_lseg(struct pnfs_layout_segment *lseg)
{
- struct pnfs_layout_segment *lseg =
- container_of(kref, struct pnfs_layout_segment, pls_refcount);
struct inode *ino = lseg->pls_layout->plh_inode;
- dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr in pnfs_insert_layout */
put_layout_hdr(ino);
}
-static void
-put_lseg(struct pnfs_layout_segment *lseg)
+/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
+ * could sleep, so must be called outside of the lock.
+ * Returns 1 if object was removed, otherwise return 0.
+ */
+static int
+put_lseg_locked(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
{
- if (!lseg)
- return;
+ dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
+ atomic_read(&lseg->pls_refcount),
+ test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ if (atomic_dec_and_test(&lseg->pls_refcount)) {
+ struct inode *ino = lseg->pls_layout->plh_inode;
- dprintk("%s: lseg %p ref %d\n", __func__, lseg,
- atomic_read(&lseg->pls_refcount.refcount));
- kref_put(&lseg->pls_refcount, destroy_lseg);
+ BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+ list_del(&lseg->pls_list);
+ if (list_empty(&lseg->pls_layout->plh_segs)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(ino)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ /* List does not take a reference, so no need for put here */
+ list_del_init(&lseg->pls_layout->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ list_add(&lseg->pls_list, tmp_list);
+ return 1;
+ }
+ return 0;
}
-static void
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+static bool
+should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
{
- struct pnfs_layout_segment *lseg, *next;
- struct nfs_client *clp;
+ return (recall_iomode == IOMODE_ANY ||
+ lseg_iomode == recall_iomode);
+}
- dprintk("%s:Begin lo %p\n", __func__, lo);
+/* Returns 1 if lseg is removed from list, 0 otherwise */
+static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list)
+{
+ int rv = 0;
- assert_spin_locked(&lo->plh_inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
- dprintk("%s: freeing lseg %p\n", __func__, lseg);
- list_move(&lseg->pls_list, tmp_list);
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ /* Remove the reference keeping the lseg in the
+ * list. It will now be removed when all
+ * outstanding io is finished.
+ */
+ rv = put_lseg_locked(lseg, tmp_list);
}
- clp = NFS_SERVER(lo->plh_inode)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lo->plh_layouts);
- spin_unlock(&clp->cl_lock);
+ return rv;
+}
- dprintk("%s:Return\n", __func__);
+/* Returns count of number of matching invalid lsegs remaining in list
+ * after call.
+ */
+static int
+mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ u32 iomode)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ int invalid = 0, removed = 0;
+
+ dprintk("%s:Begin lo %p\n", __func__, lo);
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ dprintk("%s: freeing lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+ lseg->pls_range.length);
+ invalid++;
+ removed += mark_lseg_invalid(lseg, tmp_list);
+ }
+ dprintk("%s:Return %i\n", __func__, invalid - removed);
+ return invalid - removed;
}
static void
-pnfs_free_lseg_list(struct list_head *tmp_list)
+pnfs_free_lseg_list(struct list_head *free_me)
{
- struct pnfs_layout_segment *lseg;
+ struct pnfs_layout_segment *lseg, *tmp;
- while (!list_empty(tmp_list)) {
- lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
- pls_list);
- dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+ list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
list_del(&lseg->pls_list);
- put_lseg(lseg);
+ free_lseg(lseg);
}
}
@@ -285,7 +326,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- pnfs_clear_lseg_list(lo, &tmp_list);
+ set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+ mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
/* Matched by refcount set to 1 in alloc_init_layout_hdr */
put_layout_hdr_locked(lo);
}
@@ -477,9 +519,12 @@ pnfs_find_alloc_layout(struct inode *ino)
dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
assert_spin_locked(&ino->i_lock);
- if (nfsi->layout)
- return nfsi->layout;
-
+ if (nfsi->layout) {
+ if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
+ return NULL;
+ else
+ return nfsi->layout;
+ }
spin_unlock(&ino->i_lock);
new = alloc_init_layout_hdr(ino);
spin_lock(&ino->i_lock);
@@ -520,7 +565,8 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
- if (is_matching_lseg(lseg, iomode)) {
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+ is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
}
@@ -529,7 +575,7 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
}
dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->pls_refcount.refcount) : 0);
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
return ret;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1093720..787253e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,10 +30,15 @@
#ifndef FS_NFS_PNFS_H
#define FS_NFS_PNFS_H
+enum {
+ NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+};
+
struct pnfs_layout_segment {
struct list_head pls_list;
struct pnfs_layout_range pls_range;
- struct kref pls_refcount;
+ atomic_t pls_refcount;
+ unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
};
@@ -44,6 +49,7 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
/* Per-layout driver specific registration structure */
--
1.7.2.1
On Dec 22, 2010, at 6:35 PM, Trond Myklebust wrote:
> On Wed, 2010-12-22 at 17:08 -0500, Fred Isaman wrote:
>> On Dec 22, 2010, at 4:43 PM, Trond Myklebust wrote:
>>
>>> On Tue, 2010-12-21 at 23:00 -0500, Fred Isaman wrote:
>>>> This is to prepare the way for sensible io draining. Instead of just
>>>> removing the lseg from the list, we instead clear the VALID flag
>>>> (preventing new io from grabbing references to the lseg) and remove
>>>> the reference holding it in the list. Thus the lseg will be removed
>>>> once any io in progress completes and any references still held are
>>>> dropped.
>>>>
>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>> ---
>>>> fs/nfs/inode.c | 2 +-
>>>> fs/nfs/pnfs.c | 121 ++++++++++++++++++++++++++++++++++++-------------------
>>>> fs/nfs/pnfs.h | 8 +++-
>>>> 3 files changed, 87 insertions(+), 44 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>>>> index e67e31c..43a69da 100644
>>>> --- a/fs/nfs/inode.c
>>>> +++ b/fs/nfs/inode.c
>>>
>>>> -/* Called without i_lock held, as the free_lseg call may sleep */
>>>> -static void
>>>> -destroy_lseg(struct kref *kref)
>>>> +static void free_lseg(struct pnfs_layout_segment *lseg)
>>>> {
>>>> - struct pnfs_layout_segment *lseg =
>>>> - container_of(kref, struct pnfs_layout_segment, pls_refcount);
>>>> struct inode *ino = lseg->pls_layout->plh_inode;
>>>>
>>>> - dprintk("--> %s\n", __func__);
>>>> + BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
>>>> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>>>> /* Matched by get_layout_hdr in pnfs_insert_layout */
>>>> put_layout_hdr(ino);
>>>> }
>>>
>>> <snip>
>>>
>>>> static void
>>>> -pnfs_free_lseg_list(struct list_head *tmp_list)
>>>> +pnfs_free_lseg_list(struct list_head *free_me)
>>>> {
>>>> - struct pnfs_layout_segment *lseg;
>>>> + struct pnfs_layout_segment *lseg, *tmp;
>>>>
>>>> - while (!list_empty(tmp_list)) {
>>>> - lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
>>>> - pls_list);
>>>> - dprintk("%s calling put_lseg on %p\n", __func__, lseg);
>>>> - list_del(&lseg->pls_list);
>>>> - put_lseg(lseg);
>>>> - }
>>>> + list_for_each_entry_safe(lseg, tmp, free_me, pls_list)
>>>> + free_lseg(lseg);
>>>> + INIT_LIST_HEAD(free_me);
>>>> }
>>>
>>> The above looks very dubious to me. Why is this change needed, and what
>>> guarantees do we have that free_lseg() will do the right thing w.r.t.
>>> removing stuff from the list?
>>
>> Note that this is called with a list of lsegs which already have refcount==0 and have been removed from generic layer lists such that no new reference is coming. All that is expected of free_lseg is that the layout driver gets a chance to free any resources it has attached to the lseg, and that we release the reference the lseg holds on the layout header itself.
>>
>> The primary reason for this is that the file layouts ld->free_lseg call can call nfs_put_client, which can sleep. So this allows the ld->free_lseg to be postponed until we can drop locks, similar to the dispose_list function in fs/inode.c
>
> Sure, but why do these things have to be on the free_me list when you
> call ld->free_lseg(lseg)? Why do you suddenly need the INIT_LIST_HEAD()?
>
> As I said, that all looks very dubious...
That was my attempt at an optimization: a single INIT_LIST_HEAD() leaves the free_me list in the same state as if list_del had been called on each entry. However, I can just call list_del before calling free_lseg.
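For illustration, the pattern under discussion is the usual "collect under
the lock, free after dropping it" idiom, as in pnfs_destroy_layout above,
because ld->free_lseg may sleep. A rough sketch, using the names from the
patch, with either of the two equivalent list-emptying variants:

	struct pnfs_layout_segment *lseg, *tmp;
	LIST_HEAD(free_me);

	spin_lock(&ino->i_lock);
	/* detach the lsegs whose last reference just went away */
	mark_matching_lsegs_invalid(lo, &free_me, IOMODE_ANY);
	spin_unlock(&ino->i_lock);

	/* now it is safe to call ld->free_lseg, which may sleep */
	list_for_each_entry_safe(lseg, tmp, &free_me, pls_list) {
		list_del(&lseg->pls_list);	/* per-entry removal ... */
		free_lseg(lseg);
	}
	/* ... or, equivalently, drop the list_del and reset the list
	 * head once with INIT_LIST_HEAD(&free_me) after the loop. */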
Fred
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> [email protected]
> http://www.netapp.com
>
This is the heart of the wave 2 submission. Add the code to trigger
drain and forget of any affected layouts. In addition, we set a
"barrier", below which any LAYOUTGET reply is ignored. This is to
compensate for the fact that we do not wait for outstanding LAYOUTGETs
to complete, as per section 12.5.5.2.1 of RFC 5661.
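The barrier test is the standard wrapping-sequence-number comparison: the
signed difference of two u32 seqids decides which is newer, so replies at or
below the barrier are ignored. A small stand-alone sketch of just that
arithmetic (illustrative only, not code from the patch):

#include <stdint.h>
#include <stdio.h>

/* "seqid is at or below the barrier", in wraparound-safe form */
static int seqid_blocked(uint32_t barrier, uint32_t seqid)
{
	return (int32_t)(barrier - seqid) >= 0;
}

int main(void)
{
	/* works across the 2^32 wrap as long as ids stay within 2^31 */
	printf("%d\n", seqid_blocked(10, 9));          /* 1: ignored  */
	printf("%d\n", seqid_blocked(10, 11));         /* 0: accepted */
	printf("%d\n", seqid_blocked(5, 0xfffffffe));  /* 1: old, pre-wrap */
	return 0;
}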
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback_proc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/nfs4_fs.h | 1 +
fs/nfs/pnfs.c | 83 ++++++++++++++++++++++++++--------
fs/nfs/pnfs.h | 11 ++++
4 files changed, 194 insertions(+), 20 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1bb157..6619c05 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,6 +12,7 @@
#include "callback.h"
#include "delegation.h"
#include "internal.h"
+#include "pnfs.h"
#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -107,10 +108,126 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
+static u32 initiate_file_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *ino;
+ bool found = false;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ LIST_HEAD(free_me_list);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+ if (nfs_compare_fh(&args->cbl_fh,
+ &NFS_I(lo->plh_inode)->fh))
+ continue;
+ ino = igrab(lo->plh_inode);
+ if (!ino)
+ continue;
+ found = true;
+ /* Without this, layout can be freed as soon
+ * as we release cl_lock.
+ */
+ get_layout_hdr(lo);
+ break;
+ }
+ spin_unlock(&clp->cl_lock);
+ if (!found)
+ return NFS4ERR_NOMATCHING_LAYOUT;
+
+ spin_lock(&ino->i_lock);
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ mark_matching_lsegs_invalid(lo, &free_me_list,
+ args->cbl_range.iomode))
+ rv = NFS4ERR_DELAY;
+ else
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
+ put_layout_hdr(lo);
+ iput(ino);
+ return rv;
+}
+
+static u32 initiate_bulk_draining(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ struct pnfs_layout_hdr *lo;
+ struct inode *ino;
+ u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
+ struct pnfs_layout_hdr *tmp;
+ LIST_HEAD(recall_list);
+ LIST_HEAD(free_me_list);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+ if ((args->cbl_recall_type == RETURN_FSID) &&
+ memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
+ &args->cbl_fsid, sizeof(struct nfs_fsid)))
+ continue;
+ if (!igrab(lo->plh_inode))
+ continue;
+ get_layout_hdr(lo);
+ BUG_ON(!list_empty(&lo->plh_bulk_recall));
+ list_add(&lo->plh_bulk_recall, &recall_list);
+ }
+ spin_unlock(&clp->cl_lock);
+ list_for_each_entry_safe(lo, tmp,
+ &recall_list, plh_bulk_recall) {
+ ino = lo->plh_inode;
+ spin_lock(&ino->i_lock);
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+ rv = NFS4ERR_DELAY;
+ list_del_init(&lo->plh_bulk_recall);
+ spin_unlock(&ino->i_lock);
+ put_layout_hdr(lo);
+ iput(ino);
+ }
+ pnfs_free_lseg_list(&free_me_list);
+ return rv;
+}
+
+static u32 do_callback_layoutrecall(struct nfs_client *clp,
+ struct cb_layoutrecallargs *args)
+{
+ u32 res = NFS4ERR_DELAY;
+
+ dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
+ if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
+ goto out;
+ if (args->cbl_recall_type == RETURN_FILE)
+ res = initiate_file_draining(clp, args);
+ else
+ res = initiate_bulk_draining(clp, args);
+ clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
+out:
+ dprintk("%s returning %i\n", __func__, res);
+ return res;
+
+}
+
__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps)
{
- return cpu_to_be32(NFS4ERR_NOTSUPP); /* STUB */
+ u32 res;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (cps->clp)
+ res = do_callback_layoutrecall(cps->clp, args);
+ else
+ res = NFS4ERR_OP_NOT_IN_SESSION;
+
+ dprintk("%s: exit with status = %d\n", __func__, res);
+ return cpu_to_be32(res);
}
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a6eecf..d927251 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
NFS4CLNT_RECLAIM_REBOOT,
NFS4CLNT_RECLAIM_NOGRACE,
NFS4CLNT_DELEGRETURN,
+ NFS4CLNT_LAYOUTRECALL,
NFS4CLNT_SESSION_RESET,
NFS4CLNT_RECALL_SLOT,
};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 32b6646..bf4186b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -178,7 +178,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
*/
/* Need to hold i_lock if caller does not already hold reference */
-static void
+void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
atomic_inc(&lo->plh_refcount);
@@ -254,6 +254,7 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
/* List does not take a reference, so no need for put here */
list_del_init(&lseg->pls_layout->plh_layouts);
spin_unlock(&clp->cl_lock);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
}
list_add(&lseg->pls_list, tmp_list);
return 1;
@@ -287,7 +288,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
/* Returns count of number of matching invalid lsegs remaining in list
* after call.
*/
-static int
+int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
u32 iomode)
@@ -310,7 +311,7 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return invalid - removed;
}
-static void
+void
pnfs_free_lseg_list(struct list_head *free_me)
{
struct pnfs_layout_segment *lseg, *tmp;
@@ -363,23 +364,45 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
}
/* update lo->plh_stateid with new if is more recent */
-static void
-pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new)
+void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
+ bool update_barrier)
{
u32 oldseq, newseq;
oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
newseq = be32_to_cpu(new->stateid.seqid);
- if ((int)(newseq - oldseq) > 0)
+ if ((int)(newseq - oldseq) > 0) {
memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
+ if (update_barrier) {
+ u32 new_barrier = be32_to_cpu(new->stateid.seqid);
+
+ if ((int)(new_barrier - lo->plh_barrier))
+ lo->plh_barrier = new_barrier;
+ } else {
+ /* Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids. It needs to be
+ * within 2**31 to count as "behind", so if it
+ * gets too near that limit, give us a litle leeway
+ * and bring it to within 2**30.
+ * NOTE - and yes, this is all unsigned arithmetic.
+ */
+ if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
+ lo->plh_barrier = newseq - (1 << 30);
+ }
+ }
}
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, int lget)
-{
- return (list_empty(&lo->plh_segs) &&
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
+ int lget)
+{
+ if ((stateid) &&
+ (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+ return true;
+ return test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ (list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
}
@@ -391,7 +414,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, 1)) {
+ if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
status = -EAGAIN;
} else if (list_empty(&lo->plh_segs)) {
int seq;
@@ -510,6 +533,7 @@ alloc_init_layout_hdr(struct inode *ino)
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_bulk_recall);
lo->plh_inode = ino;
return lo;
}
@@ -561,7 +585,7 @@ is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
{
struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -606,19 +630,22 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
}
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_has_layout(lo, iomode);
- if (lseg) {
- dprintk("%s: Using cached lseg %p for iomode %d)\n",
- __func__, lseg, iomode);
+ /* Do we even need to bother with this? */
+ if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
+ /* Check to see if the layout for the given range already exists */
+ lseg = pnfs_find_lseg(lo, iomode);
+ if (lseg)
+ goto out_unlock;
/* if LAYOUTGET already failed once we don't try again */
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
- if (pnfs_layoutgets_blocked(lo, 0))
+ if (pnfs_layoutgets_blocked(lo, NULL, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
@@ -641,6 +668,7 @@ pnfs_update_layout(struct inode *ino,
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
+ clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
spin_unlock(&ino->i_lock);
}
@@ -662,6 +690,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;
/* Verify we got what we asked for.
@@ -688,16 +717,32 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
spin_lock(&ino->i_lock);
+ if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ dprintk("%s forget reply due to recall\n", __func__);
+ goto out_forget_reply;
+ }
+
+ if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+ dprintk("%s forget reply due to state\n", __func__);
+ goto out_forget_reply;
+ }
init_lseg(lo, lseg);
lseg->pls_range = res->range;
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
/* Done processing layoutget. Set the layout stateid */
- pnfs_set_layout_stateid(lo, &res->stateid);
+ pnfs_set_layout_stateid(lo, &res->stateid, false);
spin_unlock(&ino->i_lock);
out:
return status;
+
+out_forget_reply:
+ spin_unlock(&ino->i_lock);
+ lseg->pls_layout = lo;
+ NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ goto out;
}
/*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 8aaab56..f91d0d4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -49,6 +49,7 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
+ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
@@ -67,9 +68,11 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_bulk_recall; /* clnt list of bulk recalls */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
atomic_t plh_outstanding; /* number of RPCs out */
+ u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
struct inode *plh_inode;
};
@@ -139,18 +142,26 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
/* pnfs.c */
+void get_layout_hdr(struct pnfs_layout_hdr *lo);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct pnfs_layout_hdr *lo);
+void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);
+int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ u32 iomode);
static inline int lo_fail_bit(u32 iomode)
--
1.7.2.1
On Dec 23, 2010, at 2:25 PM, Trond Myklebust wrote:
> On Thu, 2010-12-23 at 12:29 -0500, Fred Isaman wrote:
>> This is the heart of the wave 2 submission. Add the code to trigger
>> drain and forget of any afected layouts. In addition, we set a
>> "barrier", below which any LAYOUTGET reply is ignored. This is to
>> compensate for the fact that we do not wait for outstanding LAYOUTGETs
>> to complete as per section 12.5.5.2.1 of RFC 5661.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/callback_proc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
>> fs/nfs/nfs4_fs.h | 1 +
>> fs/nfs/pnfs.c | 84 ++++++++++++++++++++++++++--------
>> fs/nfs/pnfs.h | 12 +++++
>> 4 files changed, 196 insertions(+), 20 deletions(-)
>>
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index c1bb157..a8e8e20 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -12,6 +12,7 @@
>> #include "callback.h"
>> #include "delegation.h"
>> #include "internal.h"
>> +#include "pnfs.h"
>>
>> #ifdef NFS_DEBUG
>> #define NFSDBG_FACILITY NFSDBG_CALLBACK
>> @@ -107,10 +108,126 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
>>
>> #if defined(CONFIG_NFS_V4_1)
>>
>> +static int initiate_layout_draining(struct nfs_client *clp,
> ^^^ u32
>
>> + struct cb_layoutrecallargs *args)
>> +{
>> + struct pnfs_layout_hdr *lo;
>> + int rv = NFS4ERR_NOMATCHING_LAYOUT;
> ^^^ u32
>
OK
>> +
>> + if (args->cbl_recall_type == RETURN_FILE) {
>> + LIST_HEAD(free_me_list);
>> +
>> + args->cbl_inode = NULL;
>> + spin_lock(&clp->cl_lock);
>> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
>> + if (nfs_compare_fh(&args->cbl_fh,
>> + &NFS_I(lo->plh_inode)->fh))
>> + continue;
>> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>> + rv = NFS4ERR_DELAY;
>> + else {
>> + /* Without this, layout can be freed as soon
>> + * as we release cl_lock. Matched in
>> + * do_callback_layoutrecall.
>> + */
>> + get_layout_hdr(lo);
>> + args->cbl_inode = lo->plh_inode;
>> + rv = NFS4_OK;
>> + }
>> + break;
>> + }
>> + spin_unlock(&clp->cl_lock);
>> +
>> + spin_lock(&lo->plh_inode->i_lock);
>
> What guarantees do you have that the inode still exists here? Does the
> layout segment hold a reference to it?
>
This section is why I once asked whether the layout needs to reference the inode. I believe the answer is that nothing guarantees the inode stays around: the lseg/layout definitely does not hold a direct reference. I assume doing a grab/put of the inode here and below is acceptable?
> Also, How do you know that (!test_bit(NFS_LAYOUT_BULK_RECALL,
> &lo->plh_flags)) is still valid when you get here?
>
Because it cannot be set elsewhere while the NFS4CLNT_LAYOUTRECALL bit is set.
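(The serialization referred to here is the test_and_set_bit()/clear_bit()
pair in do_callback_layoutrecall() shown above; only one recall can be inside
that window at a time, so nothing else can flip NFS_LAYOUT_BULK_RECALL
underneath it. Roughly sketched:)

	if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
		return NFS4ERR_DELAY;	/* another recall already running */
	/* ... only this path may set or act on NFS_LAYOUT_BULK_RECALL ... */
	clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);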
>> + if (rv == NFS4_OK) {
>> + lo->plh_block_lgets++;
>> + mark_matching_lsegs_invalid(lo, &free_me_list,
>> + args->cbl_range.iomode);
>> + if (list_empty(&lo->plh_segs))
>> + rv = NFS4ERR_NOMATCHING_LAYOUT;
>> + }
>> + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
>> + spin_unlock(&lo->plh_inode->i_lock);
>> + pnfs_free_lseg_list(&free_me_list);
>> + } else {
>> + struct pnfs_layout_hdr *tmp;
>> + LIST_HEAD(recall_list);
>> + LIST_HEAD(free_me_list);
>> + struct pnfs_layout_range range = {
>> + .iomode = IOMODE_ANY,
>> + .offset = 0,
>> + .length = NFS4_MAX_UINT64,
>> + };
>> +
>> + spin_lock(&clp->cl_lock);
>> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
>> + if ((args->cbl_recall_type == RETURN_FSID) &&
>> + memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
>> + &args->cbl_fsid, sizeof(struct nfs_fsid)))
>> + continue;
>> + get_layout_hdr(lo);
>> + BUG_ON(!list_empty(&lo->plh_bulk_recall));
>> + list_add(&lo->plh_bulk_recall, &recall_list);
>> + }
>> + spin_unlock(&clp->cl_lock);
>> + list_for_each_entry_safe(lo, tmp,
>> + &recall_list, plh_bulk_recall) {
>> + spin_lock(&lo->plh_inode->i_lock);
>
> Ditto. What guarantees that the inode still exists here, and how do you
> avoid races with a file return request?
>
>> + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
>> + mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode);
>> + list_del_init(&lo->plh_bulk_recall);
>> + if (!list_empty(&lo->plh_segs))
>> + rv = NFS4_OK;
>> + spin_unlock(&lo->plh_inode->i_lock);
>> + put_layout_hdr(lo);
>> + }
>> + pnfs_free_lseg_list(&free_me_list);
>> + }
>> + return rv;
>> +}
>
> Shouldn't the above really be 2 different functions? One for file
> return, and the other for bulk layout returns?
>
OK
>> +static u32 do_callback_layoutrecall(struct nfs_client *clp,
>> + struct cb_layoutrecallargs *args)
>> +{
>> + u32 status, res = NFS4ERR_DELAY;
>> +
>> + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
>> + if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
>> + goto out;
>> + status = initiate_layout_draining(clp, args);
>> + if (status)
> ^^^^^^^^^^^
> If you are going to insist on spelling out 'rv = NFS4_OK' instead of 'rv
> = 0' in initiate_layout_draining(), then shouldn't the above test be 'if
> (status != NFS4_OK)'?
>
> Please just dump all these NFS4_OK references, and replace them with 'rv
> = 0' so that tests like the above make sense. That's consistent with the
> rest of the NFS code.
OK
>
>> + res = status;
>> + else if (args->cbl_recall_type == RETURN_FILE) {
>> + struct pnfs_layout_hdr *lo;
>> +
>> + lo = NFS_I(args->cbl_inode)->layout;
>> + spin_lock(&lo->plh_inode->i_lock);
>> + lo->plh_block_lgets--;
>> + spin_unlock(&lo->plh_inode->i_lock);
>> + put_layout_hdr(lo);
>> + res = NFS4ERR_DELAY;
>
> Eh? Why doesn't this belong in (the split up version of)
> initiate_layout_draining()?
You're right. This is an artifact from when LAYOUTRETURN was sent. I'll split initiate_layout_draining and add this hunk in.
Fred
>
>> + }
>> + clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
>> +out:
>> + dprintk("%s returning %i\n", __func__, res);
>> + return res;
>> +
>> +}
>> +
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> [email protected]
> http://www.netapp.com
>
On Thu, 2010-12-23 at 15:08 -0500, Fred Isaman wrote:
> On Dec 23, 2010, at 2:25 PM, Trond Myklebust wrote:
>
> > On Thu, 2010-12-23 at 12:29 -0500, Fred Isaman wrote:
> >> +
> >> + if (args->cbl_recall_type == RETURN_FILE) {
> >> + LIST_HEAD(free_me_list);
> >> +
> >> + args->cbl_inode = NULL;
> >> + spin_lock(&clp->cl_lock);
> >> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
> >> + if (nfs_compare_fh(&args->cbl_fh,
> >> + &NFS_I(lo->plh_inode)->fh))
> >> + continue;
> >> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
> >> + rv = NFS4ERR_DELAY;
> >> + else {
> >> + /* Without this, layout can be freed as soon
> >> + * as we release cl_lock. Matched in
> >> + * do_callback_layoutrecall.
> >> + */
> >> + get_layout_hdr(lo);
> >> + args->cbl_inode = lo->plh_inode;
> >> + rv = NFS4_OK;
> >> + }
> >> + break;
> >> + }
> >> + spin_unlock(&clp->cl_lock);
> >> +
> >> + spin_lock(&lo->plh_inode->i_lock);
> >
> > What guarantees do you have that the inode still exists here? Does the
> > layout segment hold a reference to it?
> >
>
> This section is why I once asked about whether the layout needs to reference the inode. I believe the answer is nothing. The lseg/layout definitely does not hold direct references. I assume doing grab/put of inode here and below is acceptable?
igrab/iput is fine in this context.
There is also the question: what if we don't find a match for the layout
recall arguments? It looks to me as if we end up with 'lo ==
clp->cl_layouts', with disastrous consequences when we then do
spin_lock(lo->plh_inode->i_lock)...
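(For reference, the version 5 code at the top of this thread addresses both
points: it pins the inode with igrab() and uses a found flag so that lo is
never dereferenced when no layout matched. Sketched from that code, with the
surrounding declarations assumed:)

	bool found = false;

	spin_lock(&clp->cl_lock);
	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
		if (nfs_compare_fh(&args->cbl_fh, &NFS_I(lo->plh_inode)->fh))
			continue;
		ino = igrab(lo->plh_inode);	/* pin the inode */
		if (!ino)
			continue;
		found = true;
		get_layout_hdr(lo);		/* pin the layout header */
		break;
	}
	spin_unlock(&clp->cl_lock);
	if (!found)
		return NFS4ERR_NOMATCHING_LAYOUT;	/* lo is not valid here */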
> > Also, How do you know that (!test_bit(NFS_LAYOUT_BULK_RECALL,
> > &lo->plh_flags)) is still valid when you get here?
> >
>
> Because it can not be set elsewhere while the NFS4CLNT_LAYOUTRECALL bit is set.
OK. One race averted :-)
--
Trond Myklebust
Linux NFS client maintainer
NetApp
[email protected]
http://www.netapp.com
The comment references get_layout_hdr_locked, which never existed in
the submitted code.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index db77342..6e9daff 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -225,7 +225,7 @@ destroy_lseg(struct kref *kref)
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
- /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+ /* Matched by get_layout_hdr in pnfs_insert_layout */
put_layout_hdr(ino);
}
--
1.7.2.1
This will be required to allow us to grab a reference outside of i_lock.
While we are at it, make put_layout_hdr take the same argument as all the
related functions.
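The idiom the patch moves to is atomic_dec_and_lock(): ordinary puts stay
lock-free, and only the final put takes i_lock so that NFS_I(inode)->layout
can be cleared under it. In outline (a condensed sketch of the hunk below):

	struct inode *inode = lo->plh_inode;

	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
		/* count hit zero and we now hold i_lock */
		NFS_I(inode)->layout = NULL;
		kfree(lo);
		spin_unlock(&inode->i_lock);
	}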
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 48 ++++++++++++++++++++++++++----------------------
fs/nfs/pnfs.h | 4 ++--
2 files changed, 28 insertions(+), 24 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cd99064..32b6646 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,34 +177,38 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
* pNFS client layout cache
*/
+/* Need to hold i_lock if caller does not already hold reference */
static void
-get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+get_layout_hdr(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->plh_inode->i_lock);
- lo->plh_refcount++;
+ atomic_inc(&lo->plh_refcount);
}
static void
-put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->plh_inode->i_lock);
- BUG_ON(lo->plh_refcount == 0);
+ dprintk("%s: freeing layout cache %p\n", __func__, lo);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ NFS_I(lo->plh_inode)->layout = NULL;
+ kfree(lo);
+}
- lo->plh_refcount--;
- if (!lo->plh_refcount) {
- dprintk("%s: freeing layout cache %p\n", __func__, lo);
- BUG_ON(!list_empty(&lo->plh_layouts));
- NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
- }
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+ if (atomic_dec_and_test(&lo->plh_refcount))
+ destroy_layout_hdr(lo);
}
void
-put_layout_hdr(struct inode *inode)
+put_layout_hdr(struct pnfs_layout_hdr *lo)
{
- spin_lock(&inode->i_lock);
- put_layout_hdr_locked(NFS_I(inode)->layout);
- spin_unlock(&inode->i_lock);
+ struct inode *inode = lo->plh_inode;
+
+ if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ destroy_layout_hdr(lo);
+ spin_unlock(&inode->i_lock);
+ }
}
static void
@@ -223,7 +227,7 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
/* Matched by get_layout_hdr in pnfs_insert_layout */
- put_layout_hdr(ino);
+ put_layout_hdr(NFS_I(ino)->layout);
}
/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
@@ -490,7 +494,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
__func__, lseg, lseg->pls_range.iomode,
lseg->pls_range.offset, lseg->pls_range.length);
}
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);
dprintk("%s:Return\n", __func__);
}
@@ -503,7 +507,7 @@ alloc_init_layout_hdr(struct inode *ino)
lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
if (!lo)
return NULL;
- lo->plh_refcount = 1;
+ atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
lo->plh_inode = ino;
@@ -618,7 +622,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
- get_layout_hdr_locked(lo);
+ get_layout_hdr(lo);
if (list_empty(&lo->plh_segs)) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
@@ -641,7 +645,7 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&ino->i_lock);
}
atomic_dec(&lo->plh_outstanding);
- put_layout_hdr(ino);
+ put_layout_hdr(lo);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout->plh_flags, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 698380d..8aaab56 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,7 +65,7 @@ struct pnfs_layoutdriver_type {
};
struct pnfs_layout_hdr {
- unsigned long plh_refcount;
+ atomic_t plh_refcount;
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
@@ -147,7 +147,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
-void put_layout_hdr(struct inode *inode);
+void put_layout_hdr(struct pnfs_layout_hdr *lo);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state);
--
1.7.2.1
This is the xdr decoding for CB_LAYOUTRECALL.
Signed-off-by: Alexandros Batsakis <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Marc Eshel <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 19 +++++++++++++
fs/nfs/callback_proc.c | 6 ++++
fs/nfs/callback_xdr.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 93 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b678e3e..f6768ac 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -138,6 +138,25 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
void *dummy,
struct cb_process_state *cps);
+struct cb_layoutrecallargs {
+ struct sockaddr *cbl_addr;
+ uint32_t cbl_recall_type;
+ uint32_t cbl_layout_type;
+ uint32_t cbl_layoutchanged;
+ union {
+ struct {
+ struct nfs_fh cbl_fh;
+ struct pnfs_layout_range cbl_range;
+ nfs4_stateid cbl_stateid;
+ };
+ struct nfs_fsid cbl_fsid;
+ };
+};
+
+extern unsigned nfs4_callback_layoutrecall(
+ struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps);
+
extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1bead2..c1bb157 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -107,6 +107,12 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
+__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ return cpu_to_be32(NFS4ERR_NOTSUPP); /* STUB */
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 9dbccf4..1523c99 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -24,6 +24,7 @@
#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
+#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -223,6 +224,66 @@ out:
#if defined(CONFIG_NFS_V4_1)
+static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_layoutrecallargs *args)
+{
+ __be32 *p;
+ __be32 status = 0;
+ uint32_t iomode;
+
+ args->cbl_addr = svc_addr(rqstp);
+ p = read_buf(xdr, 4 * sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+
+ args->cbl_layout_type = ntohl(*p++);
+ /* Depite the spec's xdr, iomode really belongs in the FILE switch,
+ * as it is unuseable and ignored with the other types.
+ */
+ iomode = ntohl(*p++);
+ args->cbl_layoutchanged = ntohl(*p++);
+ args->cbl_recall_type = ntohl(*p++);
+
+ if (args->cbl_recall_type == RETURN_FILE) {
+ args->cbl_range.iomode = iomode;
+ status = decode_fh(xdr, &args->cbl_fh);
+ if (unlikely(status != 0))
+ goto out;
+
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_range.offset);
+ p = xdr_decode_hyper(p, &args->cbl_range.length);
+ status = decode_stateid(xdr, &args->cbl_stateid);
+ if (unlikely(status != 0))
+ goto out;
+ } else if (args->cbl_recall_type == RETURN_FSID) {
+ p = read_buf(xdr, 2 * sizeof(uint64_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ p = xdr_decode_hyper(p, &args->cbl_fsid.major);
+ p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
+ } else if (args->cbl_recall_type != RETURN_ALL) {
+ status = htonl(NFS4ERR_BADXDR);
+ goto out;
+ }
+ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
+ __func__,
+ args->cbl_layout_type, iomode,
+ args->cbl_layoutchanged, args->cbl_recall_type);
+out:
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+ return status;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -577,10 +638,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_SEQUENCE:
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
+ case OP_CB_LAYOUTRECALL:
*op = &callback_ops[op_nr];
break;
- case OP_CB_LAYOUTRECALL:
case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
@@ -784,6 +845,12 @@ static struct callback_op callback_ops[] = {
.res_maxsize = CB_OP_RECALL_RES_MAXSZ,
},
#if defined(CONFIG_NFS_V4_1)
+ [OP_CB_LAYOUTRECALL] = {
+ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall,
+ .decode_args =
+ (callback_decode_arg_t)decode_layoutrecall_args,
+ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
--
1.7.2.1
This prepares for future changes, where the layout state needs
to change atomically with several other variables. In particular,
it will need to know whether lo->segs is empty, as we test that instead
of manipulating the NFS_LAYOUT_STATEID_SET bit. Moreover, the
layout stateid is not really a read-mostly structure, as it is
written almost as often as it is read.
The behavior of pnfs_get_layout_stateid is also slightly changed, so that
it no longer changes the stateid. Its name is changed to
pnfs_choose_layoutget_stateid.
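The new selection rule, in outline (a simplified sketch of the hunk below,
omitting the open-stateid seqlock retry loop): if the layout holds no
segments yet, LAYOUTGET uses the open stateid; otherwise it uses the layout
stateid already held, and nothing is written back.

	spin_lock(&lo->plh_inode->i_lock);
	if (list_empty(&lo->plh_segs))
		/* first LAYOUTGET for this layout: use the open stateid */
		memcpy(dst->data, open_state->stateid.data,
		       sizeof(open_state->stateid.data));
	else
		/* otherwise use the layout stateid we already hold */
		memcpy(dst->data, lo->plh_stateid.data,
		       sizeof(lo->plh_stateid.data));
	spin_unlock(&lo->plh_inode->i_lock);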
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4xdr.c | 2 +-
fs/nfs/pnfs.c | 79 +++++++++++++++--------------------------------------
fs/nfs/pnfs.h | 7 ++---
3 files changed, 27 insertions(+), 61 deletions(-)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f3f9915..4e28242 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1798,7 +1798,7 @@ encode_layoutget(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+ pnfs_choose_layoutget_stateid(&stateid, NFS_I(args->inode)->layout,
args->ctx->state);
p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
*p = cpu_to_be32(args->maxcount);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6736f9e..08313f53 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -258,9 +258,6 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
/* List does not take a reference, so no need for put here */
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
- write_seqlock(&lo->plh_seqlock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->plh_seqlock);
dprintk("%s:Return\n", __func__);
}
@@ -319,69 +316,40 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
}
}
-/* update lo->plh_stateid with new if is more recent
- *
- * lo->plh_stateid could be the open stateid, in which case we just use what given.
- */
+/* update lo->plh_stateid with new if is more recent */
static void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
- nfs4_stateid *old = &lo->plh_stateid;
- bool overwrite = false;
-
- write_seqlock(&lo->plh_seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
- memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
- overwrite = true;
- else {
- u32 oldseq, newseq;
-
- oldseq = be32_to_cpu(old->stateid.seqid);
- newseq = be32_to_cpu(new->stateid.seqid);
- if ((int)(newseq - oldseq) > 0)
- overwrite = true;
- }
- if (overwrite)
- memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
- write_sequnlock(&lo->plh_seqlock);
-}
-
-static void
-pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
- struct nfs4_state *state)
-{
- int seq;
+ u32 oldseq, newseq;
- dprintk("--> %s\n", __func__);
- write_seqlock(&lo->plh_seqlock);
- do {
- seq = read_seqbegin(&state->seqlock);
- memcpy(lo->plh_stateid.data, state->stateid.data,
- sizeof(state->stateid.data));
- } while (read_seqretry(&state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->plh_seqlock);
- dprintk("<-- %s\n", __func__);
+ oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+ newseq = be32_to_cpu(new->stateid.seqid);
+ if ((int)(newseq - oldseq) > 0)
+ memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
}
-void
-pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state)
+int
+pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state)
{
- int seq;
+ int status = 0;
dprintk("--> %s\n", __func__);
- do {
- seq = read_seqbegin(&lo->plh_seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
- /* This will trigger retry of the read */
- pnfs_layout_from_open_stateid(lo, open_state);
- } else
- memcpy(dst->data, lo->plh_stateid.data,
- sizeof(lo->plh_stateid.data));
- } while (read_seqretry(&lo->plh_seqlock, seq));
+ spin_lock(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ int seq;
+
+ do {
+ seq = read_seqbegin(&open_state->seqlock);
+ memcpy(dst->data, open_state->stateid.data,
+ sizeof(open_state->stateid.data));
+ } while (read_seqretry(&open_state->seqlock, seq));
+ } else
+ memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+ spin_unlock(&lo->plh_inode->i_lock);
dprintk("<-- %s\n", __func__);
+ return status;
}
/*
@@ -496,7 +464,6 @@ alloc_init_layout_hdr(struct inode *ino)
lo->plh_refcount = 1;
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
- seqlock_init(&lo->plh_seqlock);
lo->plh_inode = ino;
return lo;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c2f1086..1093720 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -44,7 +44,6 @@ struct pnfs_layout_segment {
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
- NFS_LAYOUT_STATEID_SET, /* have a valid layout stateid */
};
/* Per-layout driver specific registration structure */
@@ -63,7 +62,6 @@ struct pnfs_layout_hdr {
unsigned long plh_refcount;
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
- seqlock_t plh_seqlock; /* Protects the stateid */
nfs4_stateid plh_stateid;
unsigned long plh_flags;
struct inode *plh_inode;
@@ -143,8 +141,9 @@ int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
void put_layout_hdr(struct inode *inode);
-void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct nfs4_state *open_state);
+int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
+ struct pnfs_layout_hdr *lo,
+ struct nfs4_state *open_state);
static inline int lo_fail_bit(u32 iomode)
--
1.7.2.1
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4filelayout.c | 6 +-
fs/nfs/pnfs.c | 94 +++++++++++++++++++++++-----------------------
fs/nfs/pnfs.h | 12 +++---
3 files changed, 56 insertions(+), 56 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 738d6a4..23f930c 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -82,7 +82,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
{
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
- struct nfs_server *nfss = NFS_SERVER(lo->inode);
+ struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
dprintk("--> %s\n", __func__);
@@ -101,7 +101,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
/* find and reference the deviceid */
dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
if (dsaddr == NULL) {
- dsaddr = get_device_info(lo->inode, id);
+ dsaddr = get_device_info(lo->plh_inode, id);
if (dsaddr == NULL)
goto out;
}
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->inode);
+ struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c3ca5fe..6736f9e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -180,21 +180,21 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
static void
get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->inode->i_lock);
- lo->refcount++;
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ lo->plh_refcount++;
}
static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
- assert_spin_locked(&lo->inode->i_lock);
- BUG_ON(lo->refcount == 0);
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ BUG_ON(lo->plh_refcount == 0);
- lo->refcount--;
- if (!lo->refcount) {
+ lo->plh_refcount--;
+ if (!lo->plh_refcount) {
dprintk("%s: freeing layout cache %p\n", __func__, lo);
- BUG_ON(!list_empty(&lo->layouts));
- NFS_I(lo->inode)->layout = NULL;
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ NFS_I(lo->plh_inode)->layout = NULL;
kfree(lo);
}
}
@@ -221,7 +221,7 @@ destroy_lseg(struct kref *kref)
{
struct pnfs_layout_segment *lseg =
container_of(kref, struct pnfs_layout_segment, pls_refcount);
- struct inode *ino = lseg->pls_layout->inode;
+ struct inode *ino = lseg->pls_layout->plh_inode;
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
@@ -248,19 +248,19 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
dprintk("%s:Begin lo %p\n", __func__, lo);
- assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->segs, pls_list) {
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
dprintk("%s: freeing lseg %p\n", __func__, lseg);
list_move(&lseg->pls_list, tmp_list);
}
- clp = NFS_SERVER(lo->inode)->nfs_client;
+ clp = NFS_SERVER(lo->plh_inode)->nfs_client;
spin_lock(&clp->cl_lock);
/* List does not take a reference, so no need for put here */
- list_del_init(&lo->layouts);
+ list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
dprintk("%s:Return\n", __func__);
}
@@ -312,25 +312,25 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
while (!list_empty(&tmp_list)) {
lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
- layouts);
+ plh_layouts);
dprintk("%s freeing layout for inode %lu\n", __func__,
- lo->inode->i_ino);
- pnfs_destroy_layout(NFS_I(lo->inode));
+ lo->plh_inode->i_ino);
+ pnfs_destroy_layout(NFS_I(lo->plh_inode));
}
}
-/* update lo->stateid with new if is more recent
+/* update lo->plh_stateid with new if is more recent
*
- * lo->stateid could be the open stateid, in which case we just use what given.
+ * lo->plh_stateid could be the open stateid, in which case we just use what given.
*/
static void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
- nfs4_stateid *old = &lo->stateid;
+ nfs4_stateid *old = &lo->plh_stateid;
bool overwrite = false;
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
@@ -344,7 +344,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
}
if (overwrite)
memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
}
static void
@@ -354,14 +354,14 @@ pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
int seq;
dprintk("--> %s\n", __func__);
- write_seqlock(&lo->seqlock);
+ write_seqlock(&lo->plh_seqlock);
do {
seq = read_seqbegin(&state->seqlock);
- memcpy(lo->stateid.data, state->stateid.data,
+ memcpy(lo->plh_stateid.data, state->stateid.data,
sizeof(state->stateid.data));
} while (read_seqretry(&state->seqlock, seq));
set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
- write_sequnlock(&lo->seqlock);
+ write_sequnlock(&lo->plh_seqlock);
dprintk("<-- %s\n", __func__);
}
@@ -373,14 +373,14 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
do {
- seq = read_seqbegin(&lo->seqlock);
+ seq = read_seqbegin(&lo->plh_seqlock);
if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
/* This will trigger retry of the read */
pnfs_layout_from_open_stateid(lo, open_state);
} else
- memcpy(dst->data, lo->stateid.data,
- sizeof(lo->stateid.data));
- } while (read_seqretry(&lo->seqlock, seq));
+ memcpy(dst->data, lo->plh_stateid.data,
+ sizeof(lo->plh_stateid.data));
+ } while (read_seqretry(&lo->plh_seqlock, seq));
dprintk("<-- %s\n", __func__);
}
@@ -395,7 +395,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
u32 iomode)
{
- struct inode *ino = lo->inode;
+ struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
struct pnfs_layout_segment *lseg = NULL;
@@ -405,7 +405,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
BUG_ON(ctx == NULL);
lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
if (lgp == NULL) {
- put_layout_hdr(lo->inode);
+ put_layout_hdr(lo->plh_inode);
return NULL;
}
lgp->args.minlength = NFS4_MAX_UINT64;
@@ -450,16 +450,16 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
- assert_spin_locked(&lo->inode->i_lock);
- if (list_empty(&lo->segs)) {
- struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ struct nfs_client *clp = NFS_SERVER(lo->plh_inode)->nfs_client;
spin_lock(&clp->cl_lock);
- BUG_ON(!list_empty(&lo->layouts));
- list_add_tail(&lo->layouts, &clp->cl_layouts);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- list_for_each_entry(lp, &lo->segs, pls_list) {
+ list_for_each_entry(lp, &lo->plh_segs, pls_list) {
if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
@@ -474,7 +474,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
break;
}
if (!found) {
- list_add_tail(&lseg->pls_list, &lo->segs);
+ list_add_tail(&lseg->pls_list, &lo->plh_segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
__func__, lseg, lseg->pls_range.iomode,
@@ -493,11 +493,11 @@ alloc_init_layout_hdr(struct inode *ino)
lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
if (!lo)
return NULL;
- lo->refcount = 1;
- INIT_LIST_HEAD(&lo->layouts);
- INIT_LIST_HEAD(&lo->segs);
- seqlock_init(&lo->seqlock);
- lo->inode = ino;
+ lo->plh_refcount = 1;
+ INIT_LIST_HEAD(&lo->plh_layouts);
+ INIT_LIST_HEAD(&lo->plh_segs);
+ seqlock_init(&lo->plh_seqlock);
+ lo->plh_inode = ino;
return lo;
}
@@ -551,8 +551,8 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
dprintk("%s:Begin\n", __func__);
- assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry(lseg, &lo->segs, pls_list) {
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
@@ -619,7 +619,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
- struct inode *ino = lo->inode;
+ struct inode *ino = lo->plh_inode;
int status = 0;
/* Inject layout blob into I/O device driver */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 6fcc073..c2f1086 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -60,13 +60,13 @@ struct pnfs_layoutdriver_type {
};
struct pnfs_layout_hdr {
- unsigned long refcount;
- struct list_head layouts; /* other client layouts */
- struct list_head segs; /* layout segments list */
- seqlock_t seqlock; /* Protects the stateid */
- nfs4_stateid stateid;
+ unsigned long plh_refcount;
+ struct list_head plh_layouts; /* other client layouts */
+ struct list_head plh_segs; /* layout segments list */
+ seqlock_t plh_seqlock; /* Protects the stateid */
+ nfs4_stateid plh_stateid;
unsigned long plh_flags;
- struct inode *inode;
+ struct inode *plh_inode;
};
struct pnfs_device {
--
1.7.2.1
While we are renaming all the fields, change lo->state to lo->plh_flags.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4filelayout.c | 2 +-
fs/nfs/pnfs.c | 66 +++++++++++++++++++++++-----------------------
fs/nfs/pnfs.h | 10 +++---
3 files changed, 39 insertions(+), 39 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e92f0d..738d6a4 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -243,7 +243,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
static void
filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode);
+ struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->inode);
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
dprintk("--> %s\n", __func__);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6e9daff..c3ca5fe 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -210,9 +210,9 @@ put_layout_hdr(struct inode *inode)
static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
- INIT_LIST_HEAD(&lseg->fi_list);
- kref_init(&lseg->kref);
- lseg->layout = lo;
+ INIT_LIST_HEAD(&lseg->pls_list);
+ kref_init(&lseg->pls_refcount);
+ lseg->pls_layout = lo;
}
/* Called without i_lock held, as the free_lseg call may sleep */
@@ -220,8 +220,8 @@ static void
destroy_lseg(struct kref *kref)
{
struct pnfs_layout_segment *lseg =
- container_of(kref, struct pnfs_layout_segment, kref);
- struct inode *ino = lseg->layout->inode;
+ container_of(kref, struct pnfs_layout_segment, pls_refcount);
+ struct inode *ino = lseg->pls_layout->inode;
dprintk("--> %s\n", __func__);
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
@@ -236,8 +236,8 @@ put_lseg(struct pnfs_layout_segment *lseg)
return;
dprintk("%s: lseg %p ref %d\n", __func__, lseg,
- atomic_read(&lseg->kref.refcount));
- kref_put(&lseg->kref, destroy_lseg);
+ atomic_read(&lseg->pls_refcount.refcount));
+ kref_put(&lseg->pls_refcount, destroy_lseg);
}
static void
@@ -249,9 +249,9 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
dprintk("%s:Begin lo %p\n", __func__, lo);
assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+ list_for_each_entry_safe(lseg, next, &lo->segs, pls_list) {
dprintk("%s: freeing lseg %p\n", __func__, lseg);
- list_move(&lseg->fi_list, tmp_list);
+ list_move(&lseg->pls_list, tmp_list);
}
clp = NFS_SERVER(lo->inode)->nfs_client;
spin_lock(&clp->cl_lock);
@@ -259,7 +259,7 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
list_del_init(&lo->layouts);
spin_unlock(&clp->cl_lock);
write_seqlock(&lo->seqlock);
- clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ clear_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
write_sequnlock(&lo->seqlock);
dprintk("%s:Return\n", __func__);
@@ -272,9 +272,9 @@ pnfs_free_lseg_list(struct list_head *tmp_list)
while (!list_empty(tmp_list)) {
lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
- fi_list);
+ pls_list);
dprintk("%s calling put_lseg on %p\n", __func__, lseg);
- list_del(&lseg->fi_list);
+ list_del(&lseg->pls_list);
put_lseg(lseg);
}
}
@@ -331,7 +331,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool overwrite = false;
write_seqlock(&lo->seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags) ||
memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
overwrite = true;
else {
@@ -360,7 +360,7 @@ pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
memcpy(lo->stateid.data, state->stateid.data,
sizeof(state->stateid.data));
} while (read_seqretry(&state->seqlock, seq));
- set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+ set_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags);
write_sequnlock(&lo->seqlock);
dprintk("<-- %s\n", __func__);
}
@@ -374,7 +374,7 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
do {
seq = read_seqbegin(&lo->seqlock);
- if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+ if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->plh_flags)) {
/* This will trigger retry of the read */
pnfs_layout_from_open_stateid(lo, open_state);
} else
@@ -424,7 +424,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->state);
+ set_bit(lo_fail_bit(iomode), &lo->plh_flags);
}
return lseg;
}
@@ -459,26 +459,26 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
list_add_tail(&lo->layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- list_for_each_entry(lp, &lo->segs, fi_list) {
- if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+ list_for_each_entry(lp, &lo->segs, pls_list) {
+ if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
- list_add_tail(&lseg->fi_list, &lp->fi_list);
+ list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu before "
"lp %p iomode %d offset %llu length %llu\n",
- __func__, lseg, lseg->range.iomode,
- lseg->range.offset, lseg->range.length,
- lp, lp->range.iomode, lp->range.offset,
- lp->range.length);
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length,
+ lp, lp->pls_range.iomode, lp->pls_range.offset,
+ lp->pls_range.length);
found = 1;
break;
}
if (!found) {
- list_add_tail(&lseg->fi_list, &lo->segs);
+ list_add_tail(&lseg->pls_list, &lo->segs);
dprintk("%s: inserted lseg %p "
"iomode %d offset %llu length %llu at tail\n",
- __func__, lseg, lseg->range.iomode,
- lseg->range.offset, lseg->range.length);
+ __func__, lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset, lseg->pls_range.length);
}
get_layout_hdr_locked(lo);
@@ -538,7 +538,7 @@ pnfs_find_alloc_layout(struct inode *ino)
static int
is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
{
- return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW);
+ return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
}
/*
@@ -552,17 +552,17 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->inode->i_lock);
- list_for_each_entry(lseg, &lo->segs, fi_list) {
+ list_for_each_entry(lseg, &lo->segs, pls_list) {
if (is_matching_lseg(lseg, iomode)) {
ret = lseg;
break;
}
- if (cmp_layout(iomode, lseg->range.iomode) > 0)
+ if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
break;
}
dprintk("%s:Return lseg %p ref %d\n",
- __func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0);
+ __func__, ret, ret ? atomic_read(&ret->pls_refcount.refcount) : 0);
return ret;
}
@@ -597,7 +597,7 @@ pnfs_update_layout(struct inode *ino,
}
/* if LAYOUTGET already failed once we don't try again */
- if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state))
+ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
@@ -606,7 +606,7 @@ pnfs_update_layout(struct inode *ino,
lseg = send_layoutget(lo, ctx, iomode);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
- nfsi->layout->state, lseg);
+ nfsi->layout->plh_flags, lseg);
return lseg;
out_unlock:
spin_unlock(&ino->i_lock);
@@ -636,7 +636,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
spin_lock(&ino->i_lock);
init_lseg(lo, lseg);
- lseg->range = res->range;
+ lseg->pls_range = res->range;
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e12367d..6fcc073 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -31,10 +31,10 @@
#define FS_NFS_PNFS_H
struct pnfs_layout_segment {
- struct list_head fi_list;
- struct pnfs_layout_range range;
- struct kref kref;
- struct pnfs_layout_hdr *layout;
+ struct list_head pls_list;
+ struct pnfs_layout_range pls_range;
+ struct kref pls_refcount;
+ struct pnfs_layout_hdr *pls_layout;
};
#ifdef CONFIG_NFS_V4_1
@@ -65,7 +65,7 @@ struct pnfs_layout_hdr {
struct list_head segs; /* layout segments list */
seqlock_t seqlock; /* Protects the stateid */
nfs4_stateid stateid;
- unsigned long state;
+ unsigned long plh_flags;
struct inode *inode;
};
--
1.7.2.1
We shouldn't send a LAYOUTGET(openstateid) unless all outstanding RPCs
using the previous stateid have completed. This requires choosing the
stateid to encode earlier, so that we can abort if one is not available
(we want to use the open stateid, but a LAYOUTGET is already out using
it), and adding a count of the number of outstanding RPC calls using
layout state (which for now consists solely of LAYOUTGETs).
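To illustrate the gating rule with a minimal user-space sketch (made-up
names, not the kernel code): a LAYOUTGET using the open stateid is only
allowed when no layout segments are held, and in that state at most one
such RPC may be in flight.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for list_empty(&lo->plh_segs) and
 * atomic_read(&lo->plh_outstanding); not the kernel types. */
struct layout_state {
        int nr_segs;
        int outstanding;
};

/* lget is 1 when called from inside the send_layoutget call chain,
 * since that caller has already counted itself in 'outstanding'. */
static bool layoutgets_blocked(const struct layout_state *lo, int lget)
{
        return lo->nr_segs == 0 && lo->outstanding > lget;
}

int main(void)
{
        struct layout_state lo = { .nr_segs = 0, .outstanding = 2 };

        /* A second LAYOUTGET(openstateid) must back off (-EAGAIN): */
        printf("blocked: %d\n", layoutgets_blocked(&lo, 1));    /* 1 */
        return 0;
}

In the patch itself the equivalent check is made under the inode's i_lock,
in pnfs_choose_layoutget_stateid() and pnfs_update_layout().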
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 7 ++++++-
fs/nfs/nfs4xdr.c | 5 +----
fs/nfs/pnfs.c | 24 +++++++++++++++++++-----
fs/nfs/pnfs.h | 1 +
include/linux/nfs_xdr.h | 1 +
5 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1609a07..3b059d3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5308,6 +5308,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs4_setup_sequence(server, &lgp->args.seq_args,
&lgp->res.seq_res, 0, task))
return;
+ if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ NFS_I(lgp->args.inode)->layout,
+ lgp->args.ctx->state)) {
+ rpc_exit(task, NFS4_OK);
+ return;
+ }
rpc_call_start(task);
}
@@ -5342,7 +5348,6 @@ static void nfs4_layoutget_release(void *calldata)
struct nfs4_layoutget *lgp = calldata;
dprintk("--> %s\n", __func__);
- put_layout_hdr(lgp->args.inode);
if (lgp->res.layout.buf != NULL)
free_page((unsigned long) lgp->res.layout.buf);
put_nfs_open_context(lgp->args.ctx);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e28242..3cbdd0c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1787,7 +1787,6 @@ encode_layoutget(struct xdr_stream *xdr,
const struct nfs4_layoutget_args *args,
struct compound_hdr *hdr)
{
- nfs4_stateid stateid;
__be32 *p;
p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
@@ -1798,9 +1797,7 @@ encode_layoutget(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->range.offset);
p = xdr_encode_hyper(p, args->range.length);
p = xdr_encode_hyper(p, args->minlength);
- pnfs_choose_layoutget_stateid(&stateid, NFS_I(args->inode)->layout,
- args->ctx->state);
- p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+ p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
*p = cpu_to_be32(args->maxcount);
dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 212cbc2..59ed68b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -371,6 +371,14 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
}
+/* lget is set to 1 if called from inside send_layoutget call chain */
+static bool
+pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, int lget)
+{
+ return (list_empty(&lo->plh_segs) &&
+ (atomic_read(&lo->plh_outstanding) > lget));
+}
+
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
struct nfs4_state *open_state)
@@ -379,7 +387,9 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (list_empty(&lo->plh_segs)) {
+ if (pnfs_layoutgets_blocked(lo, 1)) {
+ status = -EAGAIN;
+ } else if (list_empty(&lo->plh_segs)) {
int seq;
do {
@@ -414,10 +424,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
BUG_ON(ctx == NULL);
lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
- if (lgp == NULL) {
- put_layout_hdr(lo->plh_inode);
+ if (lgp == NULL)
return NULL;
- }
lgp->args.minlength = NFS4_MAX_UINT64;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
lgp->args.range.iomode = iomode;
@@ -613,10 +621,16 @@ pnfs_update_layout(struct inode *ino,
if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
goto out_unlock;
- get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+ if (pnfs_layoutgets_blocked(lo, 0))
+ goto out_unlock;
+ atomic_inc(&lo->plh_outstanding);
+
+ get_layout_hdr_locked(lo);
spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
+ atomic_dec(&lo->plh_outstanding);
+ put_layout_hdr(ino);
out:
dprintk("%s end, state 0x%lx lseg %p\n", __func__,
nfsi->layout->plh_flags, lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 787253e..698380d 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -69,6 +69,7 @@ struct pnfs_layout_hdr {
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
+ atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_flags;
struct inode *plh_inode;
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f923831..0843703 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -208,6 +208,7 @@ struct nfs4_layoutget_args {
struct inode *inode;
struct nfs_open_context *ctx;
struct nfs4_sequence_args seq_args;
+ nfs4_stateid stateid;
};
struct nfs4_layoutget_res {
--
1.7.2.1
A layout can request return-on-close. How this interacts with the
forgetful model of never sending LAYOUTRETURNs is a bit ambiguous.
We forget any layouts marked roc, and wait for them to be completely
forgotten before continuing with the close. In addition, to compensate
for races with any in-flight LAYOUTGETs, and for the fact that we do not
get a layout stateid back from the server on close, we set the barrier to
the worst case of current_seqid + the number of outstanding LAYOUTGETs.
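To illustrate the barrier arithmetic with a small user-space sketch (names
made up, not the kernel code): the barrier is the highest stateid seqid
that must still be ignored, and the comparison uses signed 32-bit
subtraction so that seqid wraparound is handled.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true if 'seqid' is at or below 'barrier' and so must be ignored */
static bool seqid_blocked(uint32_t barrier, uint32_t seqid)
{
        return (int32_t)(barrier - seqid) >= 0;
}

int main(void)
{
        uint32_t current_seqid = 7;     /* seqid in the layout stateid   */
        uint32_t outstanding = 2;       /* LAYOUTGETs in flight at close */
        uint32_t barrier = current_seqid + outstanding;

        printf("%d\n", seqid_blocked(barrier, 9));      /* 1: ignored */
        printf("%d\n", seqid_blocked(barrier, 10));     /* 0: newer   */
        return 0;
}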
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/client.c | 13 +++++++
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 21 ++++++++++-
fs/nfs/nfs4state.c | 7 +++-
fs/nfs/pnfs.c | 86 ++++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/pnfs.h | 29 +++++++++++++++
include/linux/nfs_fs_sb.h | 1 +
7 files changed, 153 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 17bfc25..a5cf28a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -202,10 +202,21 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+ rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+}
+
#else
static void nfs4_shutdown_client(struct nfs_client *clp)
{
}
+
+static void pnfs_init_server(struct nfs_server *server)
+{
+}
+
#endif /* CONFIG_NFS_V4 */
/*
@@ -973,6 +984,8 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ pnfs_init_server(server);
+
return server;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index d927251..e0c3cea 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -242,7 +242,7 @@ extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3b059d3..b234642 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1838,6 +1838,8 @@ struct nfs4_closedata {
struct nfs_closeres res;
struct nfs_fattr fattr;
unsigned long timestamp;
+ bool roc;
+ u32 roc_barrier;
};
static void nfs4_free_closedata(void *data)
@@ -1845,6 +1847,8 @@ static void nfs4_free_closedata(void *data)
struct nfs4_closedata *calldata = data;
struct nfs4_state_owner *sp = calldata->state->owner;
+ if (calldata->roc)
+ pnfs_roc_release(calldata->state->inode);
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
@@ -1877,6 +1881,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
*/
switch (task->tk_status) {
case 0:
+ if (calldata->roc)
+ pnfs_roc_set_barrier(state->inode,
+ calldata->roc_barrier);
nfs_set_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
nfs4_close_clear_stateid_flags(state,
@@ -1929,8 +1936,15 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
return;
}
- if (calldata->arg.fmode == 0)
+ if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+ if (calldata->roc &&
+ pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
+ rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
+ task, NULL);
+ return;
+ }
+ }
nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
@@ -1958,7 +1972,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
*
* NOTE: Caller must be holding the sp->so_owner semaphore!
*/
-int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_closedata *calldata;
@@ -1993,6 +2007,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
+ calldata->roc = roc;
path_get(path);
calldata->path = *path;
@@ -2010,6 +2025,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i
out_free_calldata:
kfree(calldata);
out:
+ if (roc)
+ pnfs_roc_release(state->inode);
nfs4_put_open_state(state);
nfs4_put_state_owner(sp);
return status;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index fb23a32..a472b7c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -607,8 +607,11 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,
if (!call_close) {
nfs4_put_open_state(state);
nfs4_put_state_owner(owner);
- } else
- nfs4_do_close(path, state, gfp_mask, wait);
+ } else {
+ bool roc = pnfs_roc(state->inode);
+
+ nfs4_do_close(path, state, gfp_mask, wait, roc);
+ }
}
void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bf4186b..bc40897 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -256,6 +256,7 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
spin_unlock(&clp->cl_lock);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
}
+ rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
list_add(&lseg->pls_list, tmp_list);
return 1;
}
@@ -401,7 +402,8 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
if ((stateid) &&
(int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
return true;
- return test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
+ return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
(list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
}
@@ -474,6 +476,83 @@ send_layoutget(struct pnfs_layout_hdr *lo,
return lseg;
}
+bool pnfs_roc(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_segment *lseg, *tmp;
+ LIST_HEAD(tmp_list);
+ bool found = false;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ goto out_nolayout;
+ list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+ if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ mark_lseg_invalid(lseg, &tmp_list);
+ found = true;
+ }
+ if (!found)
+ goto out_nolayout;
+ lo->plh_block_lgets++;
+ get_layout_hdr(lo); /* matched in pnfs_roc_release */
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+ return true;
+
+out_nolayout:
+ spin_unlock(&ino->i_lock);
+ return false;
+}
+
+void pnfs_roc_release(struct inode *ino)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ lo->plh_block_lgets--;
+ put_layout_hdr_locked(lo);
+ spin_unlock(&ino->i_lock);
+}
+
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+ struct pnfs_layout_hdr *lo;
+
+ spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if ((int)(barrier - lo->plh_barrier) > 0)
+ lo->plh_barrier = barrier;
+ spin_unlock(&ino->i_lock);
+}
+
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_segment *lseg;
+ bool found = false;
+
+ spin_lock(&ino->i_lock);
+ list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
+ if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ found = true;
+ break;
+ }
+ if (!found) {
+ struct pnfs_layout_hdr *lo = nfsi->layout;
+ u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+
+ /* Since close does not return a layout stateid for use as
+ * a barrier, we choose the worst-case barrier.
+ */
+ *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+ }
+ spin_unlock(&ino->i_lock);
+ return found;
+}
+
/*
* Compare two layout segments for sorting into layout cache.
* We want to preferentially return RW over RO layouts, so ensure those
@@ -732,6 +811,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
*lgp->lsegpp = lseg;
pnfs_insert_layout(lo, lseg);
+ if (res->return_on_close) {
+ set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
+ }
+
/* Done processing layoutget. Set the layout stateid */
pnfs_set_layout_stateid(lo, &res->stateid, false);
spin_unlock(&ino->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f91d0d4..e2612ea 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
+ NFS_LSEG_ROC, /* roc bit received from server */
};
struct pnfs_layout_segment {
@@ -50,6 +51,7 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
+ NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};
@@ -72,6 +74,7 @@ struct pnfs_layout_hdr {
struct list_head plh_segs; /* layout segments list */
nfs4_stateid plh_stateid;
atomic_t plh_outstanding; /* number of RPCs out */
+ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
struct inode *plh_inode;
@@ -162,6 +165,10 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
u32 iomode);
+bool pnfs_roc(struct inode *ino);
+void pnfs_roc_release(struct inode *ino);
+void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
static inline int lo_fail_bit(u32 iomode)
@@ -193,6 +200,28 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
return NULL;
}
+static inline bool
+pnfs_roc(struct inode *ino)
+{
+ return false;
+}
+
+static inline void
+pnfs_roc_release(struct inode *ino)
+{
+}
+
+static inline void
+pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+{
+}
+
+static inline bool
+pnfs_roc_drain(struct inode *ino, u32 *barrier)
+{
+ return false;
+}
+
static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
{
}
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e93ada0..7f20c0b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -149,6 +149,7 @@ struct nfs_server {
that are supported on this
filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
+ struct rpc_wait_queue roc_rpcwaitq;
#endif
void (*destroy)(struct nfs_server *);
--
1.7.2.1
Either a bad server reply, or the fact that we accept only a
single-element layout array in a reply, can result in a reply that does
not meet our requirements. Ensure that such replies are ignored.
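The check added below relies on the numeric ordering of the iomode
constants (IOMODE_READ = 1 < IOMODE_RW = 2, the RFC 5661 values), so a
returned iomode smaller than the requested one is weaker than requested.
A minimal user-space sketch of the acceptance test (illustrative names
only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum iomode { IOMODE_READ = 1, IOMODE_RW = 2 };
#define WHOLE_FILE 0xffffffffffffffffULL  /* stands in for NFS4_MAX_UINT64 */

struct range {
        enum iomode iomode;
        uint64_t offset;
        uint64_t length;
};

/* Only a whole-file reply of at least the requested iomode is accepted. */
static bool reply_acceptable(struct range asked, struct range got)
{
        return got.iomode >= asked.iomode &&
               got.offset == 0 &&
               got.length == WHOLE_FILE;
}

int main(void)
{
        struct range asked = { IOMODE_RW, 0, WHOLE_FILE };
        struct range got   = { IOMODE_READ, 0, WHOLE_FILE };

        printf("%d\n", reply_acceptable(asked, got));   /* 0: too weak */
        got.iomode = IOMODE_RW;
        printf("%d\n", reply_acceptable(asked, got));   /* 1: accepted */
        return 0;
}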
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 11 +++++++++++
1 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c00b673..cd99064 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -660,6 +660,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct inode *ino = lo->plh_inode;
int status = 0;
+ /* Verify we got what we asked for.
+ * Note that because the xdr parsing only accepts a single
+ * element array, this can fail even if the server is behaving
+ * correctly.
+ */
+ if (lgp->args.range.iomode > res->range.iomode ||
+ res->range.offset != 0 ||
+ res->range.length != NFS4_MAX_UINT64) {
+ status = -EINVAL;
+ goto out;
+ }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
if (!lseg || IS_ERR(lseg)) {
--
1.7.2.1
Since this list will be used to search for layouts to recall, the layout
must be added to the client list before the LAYOUTGET is sent. This
avoids a race where the recall comes in, sees nothing in the client list,
and prepares to return NOMATCHING, while the LAYOUTGET is processed
before the recall updates the stateid.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 27 +++++++++++++++++++--------
1 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 59ed68b..c00b673 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -469,14 +469,6 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin\n", __func__);
assert_spin_locked(&lo->plh_inode->i_lock);
- if (list_empty(&lo->plh_segs)) {
- struct nfs_client *clp = NFS_SERVER(lo->plh_inode)->nfs_client;
-
- spin_lock(&clp->cl_lock);
- BUG_ON(!list_empty(&lo->plh_layouts));
- list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
- spin_unlock(&clp->cl_lock);
- }
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
continue;
@@ -597,6 +589,7 @@ pnfs_update_layout(struct inode *ino,
enum pnfs_iomode iomode)
{
struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
@@ -626,9 +619,27 @@ pnfs_update_layout(struct inode *ino,
atomic_inc(&lo->plh_outstanding);
get_layout_hdr_locked(lo);
+ if (list_empty(&lo->plh_segs)) {
+ /* The lo must be on the clp list if there is any
+ * chance of a CB_LAYOUTRECALL(FILE) coming in.
+ */
+ spin_lock(&clp->cl_lock);
+ BUG_ON(!list_empty(&lo->plh_layouts));
+ list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
+ if (!lseg) {
+ spin_lock(&ino->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ spin_unlock(&ino->i_lock);
+ }
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(ino);
out:
--
1.7.2.1
From: Alexandros Batsakis <[email protected]>
Handle the file-layout bit in CB_RECALL_ANY by recalling all layouts,
treating it as a CB_LAYOUTRECALL(ALL). While here, validate the
craa_type_mask bitmap and use cpu_to_be32() in place of htonl().
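For reference, the RCA4_TYPE_MASK_ALL value added below is just the OR of
the listed bit positions (0-4, 8-9 and 12-15). A quick standalone check
(illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
        static const int bits[] = { 0, 1, 2, 3, 4, 8, 9, 12, 13, 14, 15 };
        unsigned long mask = 0;
        unsigned int i;

        for (i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
                mask |= 1UL << bits[i];

        printf("0x%lx\n", mask);        /* prints 0xf31f */
        return 0;
}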
Signed-off-by: Alexandros Batsakis <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback.h | 8 ++++++++
fs/nfs/callback_proc.c | 29 ++++++++++++++++++++++++++---
2 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index f6768ac..d3b44f9 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
+#define RCA4_TYPE_MASK_DIR_DLG 2
+#define RCA4_TYPE_MASK_FILE_LAYOUT 3
+#define RCA4_TYPE_MASK_BLK_LAYOUT 4
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8
+#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
+#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
+#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
struct sockaddr *craa_addr;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6619c05..4bb91cb 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -230,6 +230,17 @@ __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
return cpu_to_be32(res);
}
+static void pnfs_recall_all_layouts(struct nfs_client *clp)
+{
+ struct cb_layoutrecallargs args;
+
+ /* Pretend we got a CB_LAYOUTRECALL(ALL) */
+ memset(&args, 0, sizeof(args));
+ args.cbl_recall_type = RETURN_ALL;
+ /* FIXME we ignore errors, what should we do? */
+ do_callback_layoutrecall(clp, &args);
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
@@ -421,29 +432,41 @@ out:
return status;
}
+static bool
+validate_bitmap_values(unsigned long mask)
+{
+ return (mask & ~RCA4_TYPE_MASK_ALL) == 0;
+}
+
__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
struct cb_process_state *cps)
{
__be32 status;
fmode_t flags = 0;
- status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
if (!cps->clp) /* set in cb_sequence */
goto out;
dprintk("NFS: RECALL_ANY callback request from %s\n",
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+ status = cpu_to_be32(NFS4ERR_INVAL);
+ if (!validate_bitmap_values(args->craa_type_mask))
+ goto out;
+
+ status = cpu_to_be32(NFS4_OK);
if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)
&args->craa_type_mask))
flags = FMODE_READ;
if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)
&args->craa_type_mask))
flags |= FMODE_WRITE;
-
+ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *)
+ &args->craa_type_mask))
+ pnfs_recall_all_layouts(cps->clp);
if (flags)
nfs_expire_all_delegation_types(cps->clp, flags);
- status = htonl(NFS4_OK);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
--
1.7.2.1
On Dec 23, 2010, at 4:05 PM, Trond Myklebust wrote:
> On Thu, 2010-12-23 at 15:08 -0500, Fred Isaman wrote:
>> On Dec 23, 2010, at 2:25 PM, Trond Myklebust wrote:
>>
>>> On Thu, 2010-12-23 at 12:29 -0500, Fred Isaman wrote:
>>>> +
>>>> + if (args->cbl_recall_type == RETURN_FILE) {
>>>> + LIST_HEAD(free_me_list);
>>>> +
>>>> + args->cbl_inode = NULL;
>>>> + spin_lock(&clp->cl_lock);
>>>> + list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
>>>> + if (nfs_compare_fh(&args->cbl_fh,
>>>> + &NFS_I(lo->plh_inode)->fh))
>>>> + continue;
>>>> + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
>>>> + rv = NFS4ERR_DELAY;
>>>> + else {
>>>> + /* Without this, layout can be freed as soon
>>>> + * as we release cl_lock. Matched in
>>>> + * do_callback_layoutrecall.
>>>> + */
>>>> + get_layout_hdr(lo);
>>>> + args->cbl_inode = lo->plh_inode;
>>>> + rv = NFS4_OK;
>>>> + }
>>>> + break;
>>>> + }
>>>> + spin_unlock(&clp->cl_lock);
>>>> +
>>>> + spin_lock(&lo->plh_inode->i_lock);
>>>
>>> What guarantees do you have that the inode still exists here? Does the
>>> layout segment hold a reference to it?
>>>
>>
>> This section is why I once asked about whether the layout needs to reference the inode. I believe the answer is that nothing guarantees it stays around; the lseg/layout definitely does not hold direct references. I assume doing grab/put of the inode here and below is acceptable?
>
> igrab/iput is fine in this context.
>
> There is also the question: what if we don't find a match for the layout
> recall arguments? It looks to me as if we end up with 'lo ==
> clp->cl_layouts', with disastrous consequences when we then do
> spin_lock(lo->plh_inode->i_lock)...
>
gah, you're right. I'll get that fixed too.
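Roughly along these lines (an untested sketch only, with the middle of the
handler elided; the point is just the igrab() under cl_lock and skipping
the i_lock section when nothing matched):

        if (args->cbl_recall_type == RETURN_FILE) {
                struct inode *ino = NULL;

                spin_lock(&clp->cl_lock);
                list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
                        if (nfs_compare_fh(&args->cbl_fh,
                                           &NFS_I(lo->plh_inode)->fh))
                                continue;
                        /* Pin the inode while we still hold cl_lock */
                        ino = igrab(lo->plh_inode);
                        if (ino && !test_bit(NFS_LAYOUT_BULK_RECALL,
                                             &lo->plh_flags)) {
                                /* Matched in do_callback_layoutrecall */
                                get_layout_hdr(lo);
                                rv = NFS4_OK;
                        } else
                                rv = NFS4ERR_DELAY;
                        break;
                }
                spin_unlock(&clp->cl_lock);

                if (ino) {
                        spin_lock(&ino->i_lock);
                        /* ... mark matching lsegs invalid as before ... */
                        spin_unlock(&ino->i_lock);
                        iput(ino);      /* matches the igrab() above */
                }
        }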
Fred
>>> Also, How do you know that (!test_bit(NFS_LAYOUT_BULK_RECALL,
>>> &lo->plh_flags)) is still valid when you get here?
>>>
>>
>> Because it can not be set elsewhere while the NFS4CLNT_LAYOUTRECALL bit is set.
>
> OK. One race averted :-)
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> [email protected]
> http://www.netapp.com
>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 3 +--
include/linux/nfs_xdr.h | 1 -
2 files changed, 1 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fcf2658..f475516 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5330,7 +5330,6 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
return;
}
}
- lgp->status = task->tk_status;
dprintk("<-- %s\n", __func__);
}
@@ -5386,7 +5385,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
status = nfs4_wait_for_completion_rpc_task(task);
if (status != 0)
goto out;
- status = lgp->status;
+ status = task->tk_status;
if (status != 0)
goto out;
status = pnfs_layout_process(lgp);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d325def..f923831 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -223,7 +223,6 @@ struct nfs4_layoutget {
struct nfs4_layoutget_args args;
struct nfs4_layoutget_res res;
struct pnfs_layout_segment **lsegpp;
- int status;
};
struct nfs4_getdeviceinfo_args {
--
1.7.2.1
No functional changes, just some minor code rearrangement and comments.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 19 ++++++++++---------
1 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f475516..1609a07 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5297,10 +5297,14 @@ static void
nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_layoutget *lgp = calldata;
- struct inode *ino = lgp->args.inode;
- struct nfs_server *server = NFS_SERVER(ino);
+ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
dprintk("--> %s\n", __func__);
+ /* Note there is a race here, where a CB_LAYOUTRECALL can come in
+ * right now covering the LAYOUTGET we are about to send.
+ * However, that is not so catastrophic, and there seems
+ * to be no way to prevent it completely.
+ */
if (nfs4_setup_sequence(server, &lgp->args.seq_args,
&lgp->res.seq_res, 0, task))
return;
@@ -5383,13 +5387,10 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
if (IS_ERR(task))
return PTR_ERR(task);
status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
- if (status != 0)
- goto out;
- status = pnfs_layout_process(lgp);
-out:
+ if (status == 0)
+ status = task->tk_status;
+ if (status == 0)
+ status = pnfs_layout_process(lgp);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
return status;
--
1.7.2.1
On Dec 22, 2010, at 4:47 PM, Trond Myklebust wrote:
> On Tue, 2010-12-21 at 23:00 -0500, Fred Isaman wrote:
>> This will be required to allow us to grab reference outside of i_lock.
>> While we are at it, make put_layout_hdr take the same argument as all the
>> related functions.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 50 ++++++++++++++++++++++++++++----------------------
>> fs/nfs/pnfs.h | 4 ++--
>> 2 files changed, 30 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 970cc3b..53a0184 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -177,34 +177,40 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
>> * pNFS client layout cache
>> */
>>
>> +/* Need to hold i_lock if caller does not already hold reference */
>> static void
>> -get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
>> +get_layout_hdr(struct pnfs_layout_hdr *lo)
>> {
>> - assert_spin_locked(&lo->plh_inode->i_lock);
>> - lo->plh_refcount++;
>> + atomic_inc(&lo->plh_refcount);
>> }
>>
>> static void
>> -put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
>> +destroy_layout_hdr(struct pnfs_layout_hdr *lo)
>> {
>> - assert_spin_locked(&lo->plh_inode->i_lock);
>> - BUG_ON(lo->plh_refcount == 0);
>> + dprintk("%s: freeing layout cache %p\n", __func__, lo);
>> + BUG_ON(!list_empty(&lo->plh_layouts));
>> + NFS_I(lo->plh_inode)->layout = NULL;
>> + kfree(lo);
>> +}
>>
>> - lo->plh_refcount--;
>> - if (!lo->plh_refcount) {
>> - dprintk("%s: freeing layout cache %p\n", __func__, lo);
>> - BUG_ON(!list_empty(&lo->plh_layouts));
>> - NFS_I(lo->plh_inode)->layout = NULL;
>> - kfree(lo);
>> - }
>> +static void
>> +put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
>> +{
>> + BUG_ON(atomic_read(&lo->plh_refcount) == 0);
>
> Isn't this debugged by now?
>
>> + if (atomic_dec_and_test(&lo->plh_refcount))
>> + destroy_layout_hdr(lo);
>> }
>>
>> void
>> -put_layout_hdr(struct inode *inode)
>> +put_layout_hdr(struct pnfs_layout_hdr *lo)
>> {
>> - spin_lock(&inode->i_lock);
>> - put_layout_hdr_locked(NFS_I(inode)->layout);
>> - spin_unlock(&inode->i_lock);
>> + struct inode *inode = lo->plh_inode;
>> +
>> + BUG_ON(atomic_read(&lo->plh_refcount) == 0);
> Ditto....
OK, I'll remove those.
Fred
>
>> + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
>> + destroy_layout_hdr(lo);
>> + spin_unlock(&inode->i_lock);
>> + }
>> }
>>
>> static void
>> @@ -224,7 +230,7 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
>> BUG_ON(atomic_read(&lseg->pls_refcount) != 0);
>> NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
>> /* Matched by get_layout_hdr in pnfs_insert_layout */
>> - put_layout_hdr(ino);
>> + put_layout_hdr(NFS_I(ino)->layout);
>> }
>>
>> /* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
>> @@ -481,7 +487,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
>> __func__, lseg, lseg->pls_range.iomode,
>> lseg->pls_range.offset, lseg->pls_range.length);
>> }
>> - get_layout_hdr_locked(lo);
>> + get_layout_hdr(lo);
>>
>> dprintk("%s:Return\n", __func__);
>> }
>> @@ -494,7 +500,7 @@ alloc_init_layout_hdr(struct inode *ino)
>> lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
>> if (!lo)
>> return NULL;
>> - lo->plh_refcount = 1;
>> + atomic_set(&lo->plh_refcount, 1);
>> INIT_LIST_HEAD(&lo->plh_layouts);
>> INIT_LIST_HEAD(&lo->plh_segs);
>> lo->plh_inode = ino;
>> @@ -609,7 +615,7 @@ pnfs_update_layout(struct inode *ino,
>> goto out_unlock;
>> atomic_inc(&lo->plh_outstanding);
>>
>> - get_layout_hdr_locked(lo);
>> + get_layout_hdr(lo);
>> if (list_empty(&lo->plh_segs)) {
>> /* The lo must be on the clp list if there is any
>> * chance of a CB_LAYOUTRECALL(FILE) coming in.
>> @@ -632,7 +638,7 @@ pnfs_update_layout(struct inode *ino,
>> spin_unlock(&ino->i_lock);
>> }
>> atomic_dec(&lo->plh_outstanding);
>> - put_layout_hdr(ino);
>> + put_layout_hdr(lo);
>> out:
>> dprintk("%s end, state 0x%lx lseg %p\n", __func__,
>> nfsi->layout->plh_flags, lseg);
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 698380d..8aaab56 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -65,7 +65,7 @@ struct pnfs_layoutdriver_type {
>> };
>>
>> struct pnfs_layout_hdr {
>> - unsigned long plh_refcount;
>> + atomic_t plh_refcount;
>> struct list_head plh_layouts; /* other client layouts */
>> struct list_head plh_segs; /* layout segments list */
>> nfs4_stateid plh_stateid;
>> @@ -147,7 +147,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
>> int pnfs_layout_process(struct nfs4_layoutget *lgp);
>> void pnfs_destroy_layout(struct nfs_inode *);
>> void pnfs_destroy_all_layouts(struct nfs_client *);
>> -void put_layout_hdr(struct inode *inode);
>> +void put_layout_hdr(struct pnfs_layout_hdr *lo);
>> int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
>> struct pnfs_layout_hdr *lo,
>> struct nfs4_state *open_state);
>
> --
> Trond Myklebust
> Linux NFS client maintainer
>
> NetApp
> [email protected]
> http://www.netapp.com
>