These fix the incorrect lock order used throughout the pnfs code. We
first simplify the logic (and in the proces squash a few bugs), then
rearrange the code so we never have to take both i_lock and cl_lock
simultaneously.
They apply to Trond's linux-next branch.
Fred
The pnfs code was using throughout the lock order i_lock, cl_lock.
This conflicts with the nfs delegation code. Rework the pnfs code
to avoid taking both locks simultaneously.
Currently the code takes the double lock to add/remove the layout to a
nfs_client list, while atomically checking that the list of lsegs is
empty. To avoid this, we rely on existing serializations. When a
layout is initialized with lseg count equal zero, LAYOUTGET's
openstateid serialization is in effect, making it safe to assume it
stays zero unless we change it. And once a layout's lseg count drops
to zero, it is set as DESTROYED and so will stay at zero.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/callback_proc.c | 2 +-
fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++--------------------
2 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 8958757..2f41dcce 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me_list);
put_layout_hdr(lo);
iput(ino);
}
- pnfs_free_lseg_list(&free_me_list);
return rv;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f89c3bb..ee6c69a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del(&lseg->pls_list);
if (list_empty(&lseg->pls_layout->plh_segs)) {
- struct nfs_client *clp;
-
- clp = NFS_SERVER(ino)->nfs_client;
- spin_lock(&clp->cl_lock);
- /* List does not take a reference, so no need for put here */
- list_del_init(&lseg->pls_layout->plh_layouts);
- spin_unlock(&clp->cl_lock);
set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
/* Matched by initial refcount set in alloc_init_layout_hdr */
put_layout_hdr_locked(lseg->pls_layout);
@@ -319,14 +312,30 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return invalid - removed;
}
+/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
- struct pnfs_layout_segment *lseg, *tmp;
+ if (!list_empty(free_me)) {
+ struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_hdr *lo;
- list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
- list_del(&lseg->pls_list);
- free_lseg(lseg);
+ lo = list_first_entry(free_me,
+ struct pnfs_layout_segment,
+ pls_list)->pls_layout;
+
+ if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
+ struct nfs_client *clp;
+
+ clp = NFS_SERVER(lo->plh_inode)->nfs_client;
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
+ }
+ list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
+ list_del(&lseg->pls_list);
+ free_lseg(lseg);
+ }
}
}
@@ -704,6 +713,7 @@ pnfs_update_layout(struct inode *ino,
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg = NULL;
+ bool first = false;
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
return NULL;
@@ -734,7 +744,10 @@ pnfs_update_layout(struct inode *ino,
atomic_inc(&lo->plh_outstanding);
get_layout_hdr(lo);
- if (list_empty(&lo->plh_segs)) {
+ if (list_empty(&lo->plh_segs))
+ first = true;
+ spin_unlock(&ino->i_lock);
+ if (first) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
@@ -743,17 +756,12 @@ pnfs_update_layout(struct inode *ino,
list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
spin_unlock(&clp->cl_lock);
}
- spin_unlock(&ino->i_lock);
lseg = send_layoutget(lo, ctx, iomode);
- if (!lseg) {
- spin_lock(&ino->i_lock);
- if (list_empty(&lo->plh_segs)) {
- spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
- spin_unlock(&clp->cl_lock);
- }
- spin_unlock(&ino->i_lock);
+ if (!lseg && first) {
+ spin_lock(&clp->cl_lock);
+ list_del_init(&lo->plh_layouts);
+ spin_unlock(&clp->cl_lock);
}
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(lo);
--
1.7.2.1
We do not need to clear the NFS_LAYOUT_BULK_RECALL, as setting it
guarantees that NFS_LAYOUT_DESTROYED will be set once any outstanding
io is finished.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dcd5d54..f89c3bb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -254,7 +254,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
/* List does not take a reference, so no need for put here */
list_del_init(&lseg->pls_layout->plh_layouts);
spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
/* Matched by initial refcount set in alloc_init_layout_hdr */
put_layout_hdr_locked(lseg->pls_layout);
@@ -753,7 +752,6 @@ pnfs_update_layout(struct inode *ino,
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
- clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
spin_unlock(&ino->i_lock);
}
--
1.7.2.1
The code could violate the following from RFC5661, section 12.5.3:
"Once a client has no more layouts on a file, the layout stateid is no
longer valid and MUST NOT be used."
This can occur when a layout already has a lseg, starts another
non-everlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg
is processed before we hit pnfs_layout_process().
Solve by setting, each time the client has no more lsegs for a file, a
flag which blocks further use of the layout and triggers its removal.
This also fixes a second bug which occurs in the same instance as
above. If we actually use pnfs_layout_process, we add the new lseg to
the layout, but the layout has been removed from the nfs_client list
by the intervening CB_LAYOUTRECALL and will not be added back. Thus
the newly acquired lseg will not be properly returned in the event of
a subsequent CB_LAYOUTRECALL.
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a..dcd5d54 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
list_del_init(&lseg->pls_layout->plh_layouts);
spin_unlock(&clp->cl_lock);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+ set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+ /* Matched by initial refcount set in alloc_init_layout_hdr */
+ put_layout_hdr_locked(lseg->pls_layout);
}
rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
list_add(&lseg->pls_list, tmp_list);
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin lo %p\n", __func__, lo);
+ if (list_empty(&lo->plh_segs)) {
+ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+ put_layout_hdr_locked(lo);
+ return 0;
+ }
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
dprintk("%s: freeing lseg %p iomode %d "
@@ -332,10 +340,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
- /* Matched by refcount set to 1 in alloc_init_layout_hdr */
- put_layout_hdr_locked(lo);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +408,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
(int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
return true;
return lo->plh_block_lgets ||
+ test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
(list_empty(&lo->plh_segs) &&
(atomic_read(&lo->plh_outstanding) > lget));
--
1.7.2.1
On 2011-02-01 18:31, Fred Isaman wrote:
>
> On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
>
>> On 2011-01-31 17:27, Fred Isaman wrote:
>>> The code could violate the following from RFC5661, section 12.5.3:
>>> "Once a client has no more layouts on a file, the layout stateid is no
>>> longer valid and MUST NOT be used."
>>
>> NACK.
>>
>> Fred, this is your interpretation of the forgetful model and it is taken
>> out of context.
>>
>> Until the spec is changed only the server invalidates the stateid by returning
>> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
>> LAYOUTRETURN does not determine that.
>>
>>
>> Also from 12.5.3:
>> Once a layout stateid is established, the "other"
>> field will stay constant unless the stateid is revoked or the client
>> returns all layouts on the file and the server disposes of the stateid.
>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>
>
> OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
> Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
> of direction here.
I disagree, and so are other server implementations, including linux-pnfs!
(It will return BAD_STATEID if the client forgets the layout
in any case but LAYOUTRETURN or CB_LAYOUTRECALL/NOMATCHING_LAYOUT)
> The alternative is a forgetful model where the client can forget layouts, but
> not layout stateid.
Right, and this is the direction we should go until the errata is in place.
>
> The question is, is there any objection to the above interpretation of LAYOUTGET with openstateid?
> Because if not, then we will shortly get an appropriate errata,
> and there is no good reason to delay the patch until the errata is finalized.
> I know previously you had objected on the basis that this prevents parallel use of openstateid with LAYOUTGET.
> But given that you had indicated that such parallel use can not be done for other reasons,
> I have been working on the assumption that the interpretation above will be accepted without controversy at the upcoming IETF.
>
> On the other hand, if there is objection to the above interpretation, that should be made known.
I simulated this with the layout-sim and given the LAYOUTGET sent with the initial stateid
is always fully serialized with other layout stateid-changing operations, this model works.
I object the timing, before the IETF discussion is over and before the upcoming Connectathon
where other server vendors have no chance to implement this erratum.
Benny
>
> Fred
>
>
>> and
>>
>> Once the client receives a layout stateid, it MUST use the correct
>> "seqid" for subsequent LAYOUTGET or LAYOUTRETURN operations. The
>> correct "seqid" is defined as the highest "seqid" value from
>> responses of fully processed LAYOUTGET or LAYOUTRETURN operations or
>> arguments of a fully processed CB_LAYOUTRECALL operation.
>>
>> and 18.43.3
>>
>> The loga_stateid field specifies a valid stateid. If a layout is not
>> currently held by the client, the loga_stateid field represents a
>> stateid reflecting the correspondingly valid open, byte-range lock,
>> or delegation stateid. Once a layout is held on the file by the
>> client, the loga_stateid field MUST be a stateid as returned from a
>> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
>> CB_LAYOUTRECALL operation (see Section 12.5.3).
>>
>> Only when we agree that the client can throw away the layout state and
>> send a singular future LAYOUTGET with a non-layout stateid - something it MUST not
>> do at the moment - only then we can do what this patch suggests.
>>
>> Benny
>>
>>>
>>> This can occur when a layout already has a lseg, starts another
>>> non-everlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg
>>> is processed before we hit pnfs_layout_process().
>>>
>>> Solve by setting, each time the client has no more lsegs for a file, a
>>> flag which blocks further use of the layout and triggers its removal.
>>>
>>> This also fixes a second bug which occurs in the same instance as
>>> above. If we actually use pnfs_layout_process, we add the new lseg to
>>> the layout, but the layout has been removed from the nfs_client list
>>> by the intervening CB_LAYOUTRECALL and will not be added back. Thus
>>> the newly acquired lseg will not be properly returned in the event of
>>> a subsequent CB_LAYOUTRECALL.
>>>
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> ---
>>> fs/nfs/pnfs.c | 12 +++++++++---
>>> 1 files changed, 9 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 1b1bc1a..dcd5d54 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
>>> list_del_init(&lseg->pls_layout->plh_layouts);
>>> spin_unlock(&clp->cl_lock);
>>> clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
>>> + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
>>> + /* Matched by initial refcount set in alloc_init_layout_hdr */
>>> + put_layout_hdr_locked(lseg->pls_layout);
>>> }
>>> rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
>>> list_add(&lseg->pls_list, tmp_list);
>>> @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>>>
>>> dprintk("%s:Begin lo %p\n", __func__, lo);
>>>
>>> + if (list_empty(&lo->plh_segs)) {
>>> + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
>>> + put_layout_hdr_locked(lo);
>>> + return 0;
>>> + }
>>> list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
>>> if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
>>> dprintk("%s: freeing lseg %p iomode %d "
>>> @@ -332,10 +340,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
>>> spin_lock(&nfsi->vfs_inode.i_lock);
>>> lo = nfsi->layout;
>>> if (lo) {
>>> - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
>>> mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
>>> - /* Matched by refcount set to 1 in alloc_init_layout_hdr */
>>> - put_layout_hdr_locked(lo);
>>> }
>>> spin_unlock(&nfsi->vfs_inode.i_lock);
>>> pnfs_free_lseg_list(&tmp_list);
>>> @@ -403,6 +408,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
>>> (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
>>> return true;
>>> return lo->plh_block_lgets ||
>>> + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
>>> test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
>>> (list_empty(&lo->plh_segs) &&
>>> (atomic_read(&lo->plh_outstanding) > lget));
>
I'm actually seeing that with the pnfs tree over 2.8.38-rc3
See signature below.
Will re-test with this patch.
Benny
[ INFO: possible circular locking dependency detected ]
2.6.38-rc3-pnfs-00391-g13858b7 #5
-------------------------------------------------------
test5/2174 is trying to acquire lock:
(&sb->s_type->i_lock_key#24){+.+...}, at: [<ffffffffa018e924>] nfs_inode_set_]
but task is already holding lock:
(&(&clp->cl_lock)->rlock){+.+...}, at: [<ffffffffa018e822>] nfs_inode_set_del]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&(&clp->cl_lock)->rlock){+.+...}:
[<ffffffff8108281f>] lock_acquire+0xd3/0x100
[<ffffffff8147df43>] _raw_spin_lock+0x36/0x69
[<ffffffffa0193b87>] pnfs_update_layout+0x2f5/0x4d9 [nfs]
[<ffffffffa0166862>] nfs_write_begin+0x90/0x257 [nfs]
[<ffffffff810e1d7b>] generic_file_buffered_write+0x106/0x267
[<ffffffff810e33a8>] __generic_file_aio_write+0x245/0x27a
[<ffffffff810e3439>] generic_file_aio_write+0x5c/0xaa
[<ffffffffa016728b>] nfs_file_write+0xdf/0x177 [nfs]
[<ffffffff8112b599>] do_sync_write+0xcb/0x108
[<ffffffff8112bf97>] vfs_write+0xb1/0x10d
[<ffffffff8112c0bc>] sys_write+0x4d/0x77
[<ffffffff8100ab82>] system_call_fastpath+0x16/0x1b
-> #0 (&sb->s_type->i_lock_key#24){+.+...}:
[<ffffffff81082457>] __lock_acquire+0xa45/0xd3a
[<ffffffff8108281f>] lock_acquire+0xd3/0x100
[<ffffffff8147df43>] _raw_spin_lock+0x36/0x69
[<ffffffffa018e924>] nfs_inode_set_delegation+0x1f4/0x250 [nfs]
[<ffffffffa017cb9b>] nfs4_opendata_to_nfs4_state+0x26c/0x2c9 [nfs]
[<ffffffffa0181827>] nfs4_do_open+0x364/0x37c [nfs]
[<ffffffffa0181867>] nfs4_atomic_open+0x28/0x45 [nfs]
[<ffffffffa0165edc>] nfs_open_revalidate+0xf8/0x1a1 [nfs]
[<ffffffff81135111>] d_revalidate+0x21/0x56
[<ffffffff811351ca>] do_revalidate+0x1d/0x7a
[<ffffffff811353eb>] do_lookup+0x1c4/0x1f8
[<ffffffff81137406>] link_path_walk+0x260/0x485
[<ffffffff8113786a>] do_path_lookup+0x50/0x10d
[<ffffffff81138658>] do_filp_open+0x137/0x65e
[<ffffffff81129e6e>] do_sys_open+0x60/0xf2
[<ffffffff81129f33>] sys_open+0x20/0x22
[<ffffffff8100ab82>] system_call_fastpath+0x16/0x1b
On 2011-01-31 17:27, Fred Isaman wrote:
> The pnfs code was using throughout the lock order i_lock, cl_lock.
> This conflicts with the nfs delegation code. Rework the pnfs code
> to avoid taking both locks simultaneously.
>
> Currently the code takes the double lock to add/remove the layout to a
> nfs_client list, while atomically checking that the list of lsegs is
> empty. To avoid this, we rely on existing serializations. When a
> layout is initialized with lseg count equal zero, LAYOUTGET's
> openstateid serialization is in effect, making it safe to assume it
> stays zero unless we change it. And once a layout's lseg count drops
> to zero, it is set as DESTROYED and so will stay at zero.
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/callback_proc.c | 2 +-
> fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++--------------------
> 2 files changed, 30 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 8958757..2f41dcce 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
> rv = NFS4ERR_DELAY;
> list_del_init(&lo->plh_bulk_recall);
> spin_unlock(&ino->i_lock);
> + pnfs_free_lseg_list(&free_me_list);
> put_layout_hdr(lo);
> iput(ino);
> }
> - pnfs_free_lseg_list(&free_me_list);
> return rv;
> }
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index f89c3bb..ee6c69a 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
> BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
> list_del(&lseg->pls_list);
> if (list_empty(&lseg->pls_layout->plh_segs)) {
> - struct nfs_client *clp;
> -
> - clp = NFS_SERVER(ino)->nfs_client;
> - spin_lock(&clp->cl_lock);
> - /* List does not take a reference, so no need for put here */
> - list_del_init(&lseg->pls_layout->plh_layouts);
> - spin_unlock(&clp->cl_lock);
> set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
> /* Matched by initial refcount set in alloc_init_layout_hdr */
> put_layout_hdr_locked(lseg->pls_layout);
> @@ -319,14 +312,30 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
> return invalid - removed;
> }
>
> +/* note free_me must contain lsegs from a single layout_hdr */
> void
> pnfs_free_lseg_list(struct list_head *free_me)
> {
> - struct pnfs_layout_segment *lseg, *tmp;
> + if (!list_empty(free_me)) {
> + struct pnfs_layout_segment *lseg, *tmp;
> + struct pnfs_layout_hdr *lo;
>
> - list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
> - list_del(&lseg->pls_list);
> - free_lseg(lseg);
> + lo = list_first_entry(free_me,
> + struct pnfs_layout_segment,
> + pls_list)->pls_layout;
> +
> + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
> + struct nfs_client *clp;
> +
> + clp = NFS_SERVER(lo->plh_inode)->nfs_client;
> + spin_lock(&clp->cl_lock);
> + list_del_init(&lo->plh_layouts);
> + spin_unlock(&clp->cl_lock);
> + }
> + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
> + list_del(&lseg->pls_list);
> + free_lseg(lseg);
> + }
> }
> }
>
> @@ -704,6 +713,7 @@ pnfs_update_layout(struct inode *ino,
> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
> struct pnfs_layout_hdr *lo;
> struct pnfs_layout_segment *lseg = NULL;
> + bool first = false;
>
> if (!pnfs_enabled_sb(NFS_SERVER(ino)))
> return NULL;
> @@ -734,7 +744,10 @@ pnfs_update_layout(struct inode *ino,
> atomic_inc(&lo->plh_outstanding);
>
> get_layout_hdr(lo);
> - if (list_empty(&lo->plh_segs)) {
> + if (list_empty(&lo->plh_segs))
> + first = true;
> + spin_unlock(&ino->i_lock);
> + if (first) {
> /* The lo must be on the clp list if there is any
> * chance of a CB_LAYOUTRECALL(FILE) coming in.
> */
> @@ -743,17 +756,12 @@ pnfs_update_layout(struct inode *ino,
> list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
> spin_unlock(&clp->cl_lock);
> }
> - spin_unlock(&ino->i_lock);
>
> lseg = send_layoutget(lo, ctx, iomode);
> - if (!lseg) {
> - spin_lock(&ino->i_lock);
> - if (list_empty(&lo->plh_segs)) {
> - spin_lock(&clp->cl_lock);
> - list_del_init(&lo->plh_layouts);
> - spin_unlock(&clp->cl_lock);
> - }
> - spin_unlock(&ino->i_lock);
> + if (!lseg && first) {
> + spin_lock(&clp->cl_lock);
> + list_del_init(&lo->plh_layouts);
> + spin_unlock(&clp->cl_lock);
> }
> atomic_dec(&lo->plh_outstanding);
> put_layout_hdr(lo);
On 2011-01-31 17:27, Fred Isaman wrote:
> The pnfs code was using throughout the lock order i_lock, cl_lock.
> This conflicts with the nfs delegation code. Rework the pnfs code
> to avoid taking both locks simultaneously.
>
> Currently the code takes the double lock to add/remove the layout to a
> nfs_client list, while atomically checking that the list of lsegs is
> empty. To avoid this, we rely on existing serializations. When a
> layout is initialized with lseg count equal zero, LAYOUTGET's
> openstateid serialization is in effect, making it safe to assume it
> stays zero unless we change it. And once a layout's lseg count drops
> to zero, it is set as DESTROYED and so will stay at zero.
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/callback_proc.c | 2 +-
> fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++--------------------
> 2 files changed, 30 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 8958757..2f41dcce 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
> rv = NFS4ERR_DELAY;
> list_del_init(&lo->plh_bulk_recall);
> spin_unlock(&ino->i_lock);
> + pnfs_free_lseg_list(&free_me_list);
> put_layout_hdr(lo);
> iput(ino);
> }
> - pnfs_free_lseg_list(&free_me_list);
> return rv;
> }
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index f89c3bb..ee6c69a 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
> BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
> list_del(&lseg->pls_list);
> if (list_empty(&lseg->pls_layout->plh_segs)) {
> - struct nfs_client *clp;
> -
> - clp = NFS_SERVER(ino)->nfs_client;
> - spin_lock(&clp->cl_lock);
> - /* List does not take a reference, so no need for put here */
> - list_del_init(&lseg->pls_layout->plh_layouts);
> - spin_unlock(&clp->cl_lock);
> set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
> /* Matched by initial refcount set in alloc_init_layout_hdr */
> put_layout_hdr_locked(lseg->pls_layout);
> @@ -319,14 +312,30 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
> return invalid - removed;
> }
>
> +/* note free_me must contain lsegs from a single layout_hdr */
> void
> pnfs_free_lseg_list(struct list_head *free_me)
> {
> - struct pnfs_layout_segment *lseg, *tmp;
> + if (!list_empty(free_me)) {
Since this puts everything in this function under the condition
why not define an inline wrapper in pnfs.h
that checks that and calls the real function only when the list's not empty
or, if this is assumed to be a rare case, just begin the function with
if (list_empty(free_me))
return;
Benny
> + struct pnfs_layout_segment *lseg, *tmp;
> + struct pnfs_layout_hdr *lo;
>
> - list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
> - list_del(&lseg->pls_list);
> - free_lseg(lseg);
> + lo = list_first_entry(free_me,
> + struct pnfs_layout_segment,
> + pls_list)->pls_layout;
> +
> + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
> + struct nfs_client *clp;
> +
> + clp = NFS_SERVER(lo->plh_inode)->nfs_client;
> + spin_lock(&clp->cl_lock);
> + list_del_init(&lo->plh_layouts);
> + spin_unlock(&clp->cl_lock);
> + }
> + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
> + list_del(&lseg->pls_list);
> + free_lseg(lseg);
> + }
> }
> }
>
> @@ -704,6 +713,7 @@ pnfs_update_layout(struct inode *ino,
> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
> struct pnfs_layout_hdr *lo;
> struct pnfs_layout_segment *lseg = NULL;
> + bool first = false;
>
> if (!pnfs_enabled_sb(NFS_SERVER(ino)))
> return NULL;
> @@ -734,7 +744,10 @@ pnfs_update_layout(struct inode *ino,
> atomic_inc(&lo->plh_outstanding);
>
> get_layout_hdr(lo);
> - if (list_empty(&lo->plh_segs)) {
> + if (list_empty(&lo->plh_segs))
> + first = true;
> + spin_unlock(&ino->i_lock);
> + if (first) {
> /* The lo must be on the clp list if there is any
> * chance of a CB_LAYOUTRECALL(FILE) coming in.
> */
> @@ -743,17 +756,12 @@ pnfs_update_layout(struct inode *ino,
> list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
> spin_unlock(&clp->cl_lock);
> }
> - spin_unlock(&ino->i_lock);
>
> lseg = send_layoutget(lo, ctx, iomode);
> - if (!lseg) {
> - spin_lock(&ino->i_lock);
> - if (list_empty(&lo->plh_segs)) {
> - spin_lock(&clp->cl_lock);
> - list_del_init(&lo->plh_layouts);
> - spin_unlock(&clp->cl_lock);
> - }
> - spin_unlock(&ino->i_lock);
> + if (!lseg && first) {
> + spin_lock(&clp->cl_lock);
> + list_del_init(&lo->plh_layouts);
> + spin_unlock(&clp->cl_lock);
> }
> atomic_dec(&lo->plh_outstanding);
> put_layout_hdr(lo);
On Wed, 2011-02-02 at 06:19 +0200, Benny Halevy wrote:
> On 2011-02-01 18:31, Fred Isaman wrote:
> >
> > On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
> >
> >> On 2011-01-31 17:27, Fred Isaman wrote:
> >>> The code could violate the following from RFC5661, section 12.5.3:
> >>> "Once a client has no more layouts on a file, the layout stateid is no
> >>> longer valid and MUST NOT be used."
> >>
> >> NACK.
> >>
> >> Fred, this is your interpretation of the forgetful model and it is taken
> >> out of context.
> >>
> >> Until the spec is changed only the server invalidates the stateid by returning
> >> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
> >> LAYOUTRETURN does not determine that.
> >>
> >>
> >> Also from 12.5.3:
> >> Once a layout stateid is established, the "other"
> >> field will stay constant unless the stateid is revoked or the client
> >> returns all layouts on the file and the server disposes of the stateid.
> >> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> > OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
> > Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
> > of direction here.
>
> I disagree, and so are other server implementations, including linux-pnfs!
> (It will return BAD_STATEID if the client forgets the layout
> in any case but LAYOUTRETURN or CB_LAYOUTRECALL/NOMATCHING_LAYOUT)
Where do you see this being allowed? I see nothing in RFC5661 that
justifies the server returning BAD_STATEID (or any other error) if the
client supplies an open stateid in LAYOUTGET. The only stateids that are
explicitly disallowed in section 12 are the special stateids.
IOW: I see no justification there for your interpretation or the above
error message. Fix the linux pnfs server and the "other server
implementation" if it does this.
The only text I can find is repeated in both section 12.5.2. and section
12.5.3 and states that the client uses an open stateid, delegation
stateid or lock stateid if it holds no layouts.
Section 12.5.5.1 then states that the client is allowed to forget
layouts and that this is safe as long as it doesn't assume it holds
layouts for ranges that lie beyond what the server granted. The same
section states that the server has no control over what the client
believes, and so it is allowed to recall a layout that the client knows
nothing about if it is in doubt.
Trond
--
Trond Myklebust
Linux NFS client maintainer
NetApp
[email protected]
http://www.netapp.com
On 2011-02-02 07:56, Trond Myklebust wrote:
> On Wed, 2011-02-02 at 06:19 +0200, Benny Halevy wrote:
>> On 2011-02-01 18:31, Fred Isaman wrote:
>>>
>>> On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
>>>
>>>> On 2011-01-31 17:27, Fred Isaman wrote:
>>>>> The code could violate the following from RFC5661, section 12.5.3:
>>>>> "Once a client has no more layouts on a file, the layout stateid is no
>>>>> longer valid and MUST NOT be used."
>>>>
>>>> NACK.
>>>>
>>>> Fred, this is your interpretation of the forgetful model and it is taken
>>>> out of context.
>>>>
>>>> Until the spec is changed only the server invalidates the stateid by returning
>>>> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
>>>> LAYOUTRETURN does not determine that.
>>>>
>>>>
>>>> Also from 12.5.3:
>>>> Once a layout stateid is established, the "other"
>>>> field will stay constant unless the stateid is revoked or the client
>>>> returns all layouts on the file and the server disposes of the stateid.
>>>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>>> OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
>>> Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
>>> of direction here.
>>
>> I disagree, and so are other server implementations, including linux-pnfs!
>> (It will return BAD_STATEID if the client forgets the layout
>> in any case but LAYOUTRETURN or CB_LAYOUTRECALL/NOMATCHING_LAYOUT)
>
> Where do you see this being allowed? I see nothing in RFC5661 that
> justifies the server returning BAD_STATEID (or any other error) if the
> client supplies an open stateid in LAYOUTGET. The only stateids that are
> explicitly disallowed in section 12 are the special stateids.
> IOW: I see no justification there for your interpretation or the above
> error message. Fix the linux pnfs server and the "other server
> implementation" if it does this.
This case is indeed not defined well in the spec.
>
> The only text I can find is repeated in both section 12.5.2. and section
> 12.5.3 and states that the client uses an open stateid, delegation
> stateid or lock stateid if it holds no layouts.
Right, but the point in which it is defined to hold no layout is when
it has returned them to the server. There is not much sense in arguing
that over and over again as we're in agreement regarding the proposed
errata.
Benny
> Section 12.5.5.1 then states that the client is allowed to forget
> layouts and that this is safe as long as it doesn't assume it holds
> layouts for ranges that lie beyond what the server granted. The same
> section states that the server has no control over what the client
> believes, and so it is allowed to recall a layout that the client knows
> nothing about if it is in doubt.
>
>
> Trond
On Feb 1, 2011, at 11:19 PM, Benny Halevy wrote:
> On 2011-02-01 18:31, Fred Isaman wrote:
>>
>> On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
>>
>>> On 2011-01-31 17:27, Fred Isaman wrote:
>>>> The code could violate the following from RFC5661, section 12.5.3:
>>>> "Once a client has no more layouts on a file, the layout stateid is no
>>>> longer valid and MUST NOT be used."
>>>
>>> NACK.
>>>
>>> Fred, this is your interpretation of the forgetful model and it is taken
>>> out of context.
>>>
>>> Until the spec is changed only the server invalidates the stateid by returning
>>> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
>>> LAYOUTRETURN does not determine that.
>>>
>>>
>>> Also from 12.5.3:
>>> Once a layout stateid is established, the "other"
>>> field will stay constant unless the stateid is revoked or the client
>>> returns all layouts on the file and the server disposes of the stateid.
>>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>
>>
>>
>> OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
>> Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
>> of direction here.
>
> I disagree, and so are other server implementations, including linux-pnfs!
> (It will return BAD_STATEID if the client forgets the layout
> in any case but LAYOUTRETURN or CB_LAYOUTRECALL/NOMATCHING_LAYOUT)
>
>> The alternative is a forgetful model where the client can forget layouts, but
>> not layout stateid.
>
> Right, and this is the direction we should go until the errata is in place.
>
>>
>> The question is, is there any objection to the above interpretation of LAYOUTGET with openstateid?
>> Because if not, then we will shortly get an appropriate errata,
>> and there is no good reason to delay the patch until the errata is finalized.
>> I know previously you had objected on the basis that this prevents parallel use of openstateid with LAYOUTGET.
>> But given that you had indicated that such parallel use can not be done for other reasons,
>> I have been working on the assumption that the interpretation above will be accepted without controversy at the upcoming IETF.
>>
>> On the other hand, if there is objection to the above interpretation, that should be made known.
>
> I simulated this with the layout-sim and given the LAYOUTGET sent with the initial stateid
> is always fully serialized with other layout stateid-changing operations, this model works.
>
> I object the timing, before the IETF discussion is over and before the upcoming Connectathon
> where other server vendors have no chance to implement this erratum.
>
Our goal is to have at cthon working code to test that is basically what will be submitted upstream, Waiting for the IETF to be over before implementing means nothing will be implemented. Instead, we have been consistently making best guesses, with Trond's input, on how each spec ambiguity will be resolved.
This patch fixes two real existing bugs. It prepares the way for fixing another real existing bug (lock inversion). Yes, it makes an assumption on how we will decide to clarify a spec issue, but so does much of the existing code. (For example, we update seqid on NOMATCHING, all the roc code.) Finally, the current code *already* makes this assumption. Because the client can randomly forget layouts, it has NO way to know when sending a NOMATCHING return whether it should keep the stateid or not, so it *already* uses the assumption that if it has none in memory, it will send an openstateid.
Fred
On 2011-02-02 17:45, Fred Isaman wrote:
>
> On Feb 1, 2011, at 11:19 PM, Benny Halevy wrote:
>
>> On 2011-02-01 18:31, Fred Isaman wrote:
>>>
>>> On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
>>>
>>>> On 2011-01-31 17:27, Fred Isaman wrote:
>>>>> The code could violate the following from RFC5661, section 12.5.3:
>>>>> "Once a client has no more layouts on a file, the layout stateid is no
>>>>> longer valid and MUST NOT be used."
>>>>
>>>> NACK.
>>>>
>>>> Fred, this is your interpretation of the forgetful model and it is taken
>>>> out of context.
>>>>
>>>> Until the spec is changed only the server invalidates the stateid by returning
>>>> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
>>>> LAYOUTRETURN does not determine that.
>>>>
>>>>
>>>> Also from 12.5.3:
>>>> Once a layout stateid is established, the "other"
>>>> field will stay constant unless the stateid is revoked or the client
>>>> returns all layouts on the file and the server disposes of the stateid.
>>>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>>>
>>>
>>>
>>> OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
>>> Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
>>> of direction here.
>>
>> I disagree, and so are other server implementations, including linux-pnfs!
>> (It will return BAD_STATEID if the client forgets the layout
>> in any case but LAYOUTRETURN or CB_LAYOUTRECALL/NOMATCHING_LAYOUT)
>>
>>> The alternative is a forgetful model where the client can forget layouts, but
>>> not layout stateid.
>>
>> Right, and this is the direction we should go until the errata is in place.
>>
>>>
>>> The question is, is there any objection to the above interpretation of LAYOUTGET with openstateid?
>>> Because if not, then we will shortly get an appropriate errata,
>>> and there is no good reason to delay the patch until the errata is finalized.
>>> I know previously you had objected on the basis that this prevents parallel use of openstateid with LAYOUTGET.
>>> But given that you had indicated that such parallel use can not be done for other reasons,
>>> I have been working on the assumption that the interpretation above will be accepted without controversy at the upcoming IETF.
>>>
>>> On the other hand, if there is objection to the above interpretation, that should be made known.
>>
>> I simulated this with the layout-sim and given the LAYOUTGET sent with the initial stateid
>> is always fully serialized with other layout stateid-changing operations, this model works.
>>
>> I object the timing, before the IETF discussion is over and before the upcoming Connectathon
>> where other server vendors have no chance to implement this erratum.
>>
>
> Our goal is to have at cthon working code to test that is basically what will be submitted upstream, Waiting for the IETF to be over before implementing means nothing will be implemented. Instead, we have been consistently making best guesses, with Trond's input, on how each spec ambiguity will be resolved.
>
> This patch fixes two real existing bugs. It prepares the way for fixing another real existing bug (lock inversion). Yes, it makes an assumption on how we will decide to clarify a spec issue, but so does much of the existing code. (For example, we update seqid on NOMATCHING, all the roc code.) Finally, the current code *already* makes this assumption. Because the client can randomly forget layouts, it has NO way to know when sending a NOMATCHING return whether it should keep the stateid or not, so it *already* uses the assumption that if it has none in memory, it will send an openstateid.
OK. I'm convinced this patch doesn't screw things up more than they already are.
Benny
>
> Fred
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2011-01-31 17:27, Fred Isaman wrote:
> The code could violate the following from RFC5661, section 12.5.3:
> "Once a client has no more layouts on a file, the layout stateid is no
> longer valid and MUST NOT be used."
NACK.
Fred, this is your interpretation of the forgetful model and it is taken
out of context.
Until the spec is changed only the server invalidates the stateid by returning
lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
LAYOUTRETURN does not determine that.
Also from 12.5.3:
Once a layout stateid is established, the "other"
field will stay constant unless the stateid is revoked or the client
returns all layouts on the file and the server disposes of the stateid.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
and
Once the client receives a layout stateid, it MUST use the correct
"seqid" for subsequent LAYOUTGET or LAYOUTRETURN operations. The
correct "seqid" is defined as the highest "seqid" value from
responses of fully processed LAYOUTGET or LAYOUTRETURN operations or
arguments of a fully processed CB_LAYOUTRECALL operation.
and 18.43.3
The loga_stateid field specifies a valid stateid. If a layout is not
currently held by the client, the loga_stateid field represents a
stateid reflecting the correspondingly valid open, byte-range lock,
or delegation stateid. Once a layout is held on the file by the
client, the loga_stateid field MUST be a stateid as returned from a
previous LAYOUTGET or LAYOUTRETURN operation or provided by a
CB_LAYOUTRECALL operation (see Section 12.5.3).
Only when we agree that the client can throw away the layout state and
send a singular future LAYOUTGET with a non-layout stateid - something it MUST not
do at the moment - only then we can do what this patch suggests.
Benny
>
> This can occur when a layout already has a lseg, starts another
> non-everlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg
> is processed before we hit pnfs_layout_process().
>
> Solve by setting, each time the client has no more lsegs for a file, a
> flag which blocks further use of the layout and triggers its removal.
>
> This also fixes a second bug which occurs in the same instance as
> above. If we actually use pnfs_layout_process, we add the new lseg to
> the layout, but the layout has been removed from the nfs_client list
> by the intervening CB_LAYOUTRECALL and will not be added back. Thus
> the newly acquired lseg will not be properly returned in the event of
> a subsequent CB_LAYOUTRECALL.
>
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/pnfs.c | 12 +++++++++---
> 1 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 1b1bc1a..dcd5d54 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
> list_del_init(&lseg->pls_layout->plh_layouts);
> spin_unlock(&clp->cl_lock);
> clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
> + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
> + /* Matched by initial refcount set in alloc_init_layout_hdr */
> + put_layout_hdr_locked(lseg->pls_layout);
> }
> rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
> list_add(&lseg->pls_list, tmp_list);
> @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>
> dprintk("%s:Begin lo %p\n", __func__, lo);
>
> + if (list_empty(&lo->plh_segs)) {
> + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
> + put_layout_hdr_locked(lo);
> + return 0;
> + }
> list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
> if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
> dprintk("%s: freeing lseg %p iomode %d "
> @@ -332,10 +340,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
> spin_lock(&nfsi->vfs_inode.i_lock);
> lo = nfsi->layout;
> if (lo) {
> - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
> mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
> - /* Matched by refcount set to 1 in alloc_init_layout_hdr */
> - put_layout_hdr_locked(lo);
> }
> spin_unlock(&nfsi->vfs_inode.i_lock);
> pnfs_free_lseg_list(&tmp_list);
> @@ -403,6 +408,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
> (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
> return true;
> return lo->plh_block_lgets ||
> + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
> test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
> (list_empty(&lo->plh_segs) &&
> (atomic_read(&lo->plh_outstanding) > lget));
On Feb 1, 2011, at 10:38 AM, Benny Halevy wrote:
> On 2011-01-31 17:27, Fred Isaman wrote:
>> The code could violate the following from RFC5661, section 12.5.3:
>> "Once a client has no more layouts on a file, the layout stateid is no
>> longer valid and MUST NOT be used."
>
> NACK.
>
> Fred, this is your interpretation of the forgetful model and it is taken
> out of context.
>
> Until the spec is changed only the server invalidates the stateid by returning
> lrs_present=false on LAYOUTRETURN. Merely forgetting the layout state without
> LAYOUTRETURN does not determine that.
>
>
> Also from 12.5.3:
> Once a layout stateid is established, the "other"
> field will stay constant unless the stateid is revoked or the client
> returns all layouts on the file and the server disposes of the stateid.
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
OK, so the question is, does sending a LAYOUTGET with openstateid "return all layouts".
Making the answer to this "yes" is perfectly consistent with the spec, given its complete absence
of direction here. The alternative is a forgetful model where the client can forget layouts, but
not layout stateid.
The question is, is there any objection to the above interpretation of LAYOUTGET with openstateid?
Because if not, then we will shortly get an appropriate errata, and there is no good reason to delay the patch until the errata is finalized. I know previously you had objected on the basis that this prevents parallel use of openstateid with LAYOUTGET. But given that you had indicated that such parallel use can not be done for other reasons, I have been working on the assumption that the interpretation above will be accepted without controversy at the upcoming IETF.
On the other hand, if there is objection to the above interpretation, that should be made known.
Fred
> and
>
> Once the client receives a layout stateid, it MUST use the correct
> "seqid" for subsequent LAYOUTGET or LAYOUTRETURN operations. The
> correct "seqid" is defined as the highest "seqid" value from
> responses of fully processed LAYOUTGET or LAYOUTRETURN operations or
> arguments of a fully processed CB_LAYOUTRECALL operation.
>
> and 18.43.3
>
> The loga_stateid field specifies a valid stateid. If a layout is not
> currently held by the client, the loga_stateid field represents a
> stateid reflecting the correspondingly valid open, byte-range lock,
> or delegation stateid. Once a layout is held on the file by the
> client, the loga_stateid field MUST be a stateid as returned from a
> previous LAYOUTGET or LAYOUTRETURN operation or provided by a
> CB_LAYOUTRECALL operation (see Section 12.5.3).
>
> Only when we agree that the client can throw away the layout state and
> send a singular future LAYOUTGET with a non-layout stateid - something it MUST not
> do at the moment - only then we can do what this patch suggests.
>
> Benny
>
>>
>> This can occur when a layout already has a lseg, starts another
>> non-everlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg
>> is processed before we hit pnfs_layout_process().
>>
>> Solve by setting, each time the client has no more lsegs for a file, a
>> flag which blocks further use of the layout and triggers its removal.
>>
>> This also fixes a second bug which occurs in the same instance as
>> above. If we actually use pnfs_layout_process, we add the new lseg to
>> the layout, but the layout has been removed from the nfs_client list
>> by the intervening CB_LAYOUTRECALL and will not be added back. Thus
>> the newly acquired lseg will not be properly returned in the event of
>> a subsequent CB_LAYOUTRECALL.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 12 +++++++++---
>> 1 files changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 1b1bc1a..dcd5d54 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
>> list_del_init(&lseg->pls_layout->plh_layouts);
>> spin_unlock(&clp->cl_lock);
>> clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
>> + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
>> + /* Matched by initial refcount set in alloc_init_layout_hdr */
>> + put_layout_hdr_locked(lseg->pls_layout);
>> }
>> rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
>> list_add(&lseg->pls_list, tmp_list);
>> @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>>
>> dprintk("%s:Begin lo %p\n", __func__, lo);
>>
>> + if (list_empty(&lo->plh_segs)) {
>> + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
>> + put_layout_hdr_locked(lo);
>> + return 0;
>> + }
>> list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
>> if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
>> dprintk("%s: freeing lseg %p iomode %d "
>> @@ -332,10 +340,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
>> spin_lock(&nfsi->vfs_inode.i_lock);
>> lo = nfsi->layout;
>> if (lo) {
>> - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
>> mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
>> - /* Matched by refcount set to 1 in alloc_init_layout_hdr */
>> - put_layout_hdr_locked(lo);
>> }
>> spin_unlock(&nfsi->vfs_inode.i_lock);
>> pnfs_free_lseg_list(&tmp_list);
>> @@ -403,6 +408,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
>> (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
>> return true;
>> return lo->plh_block_lgets ||
>> + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
>> test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
>> (list_empty(&lo->plh_segs) &&
>> (atomic_read(&lo->plh_outstanding) > lget));
On Feb 1, 2011, at 10:41 AM, Benny Halevy wrote:
> I'm actually seeing that with the pnfs tree over 2.8.38-rc3
> See signature below.
> Will re-test with this patch.
>
In your retest, note that this patch depends on patch 1 of the series to work correctly.
Fred
> Benny
>
> [ INFO: possible circular locking dependency detected ]
> 2.6.38-rc3-pnfs-00391-g13858b7 #5
> -------------------------------------------------------
> test5/2174 is trying to acquire lock:
> (&sb->s_type->i_lock_key#24){+.+...}, at: [<ffffffffa018e924>] nfs_inode_set_]
>
> but task is already holding lock:
> (&(&clp->cl_lock)->rlock){+.+...}, at: [<ffffffffa018e822>] nfs_inode_set_del]
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (&(&clp->cl_lock)->rlock){+.+...}:
> [<ffffffff8108281f>] lock_acquire+0xd3/0x100
> [<ffffffff8147df43>] _raw_spin_lock+0x36/0x69
> [<ffffffffa0193b87>] pnfs_update_layout+0x2f5/0x4d9 [nfs]
> [<ffffffffa0166862>] nfs_write_begin+0x90/0x257 [nfs]
> [<ffffffff810e1d7b>] generic_file_buffered_write+0x106/0x267
> [<ffffffff810e33a8>] __generic_file_aio_write+0x245/0x27a
> [<ffffffff810e3439>] generic_file_aio_write+0x5c/0xaa
> [<ffffffffa016728b>] nfs_file_write+0xdf/0x177 [nfs]
> [<ffffffff8112b599>] do_sync_write+0xcb/0x108
> [<ffffffff8112bf97>] vfs_write+0xb1/0x10d
> [<ffffffff8112c0bc>] sys_write+0x4d/0x77
> [<ffffffff8100ab82>] system_call_fastpath+0x16/0x1b
>
> -> #0 (&sb->s_type->i_lock_key#24){+.+...}:
> [<ffffffff81082457>] __lock_acquire+0xa45/0xd3a
> [<ffffffff8108281f>] lock_acquire+0xd3/0x100
> [<ffffffff8147df43>] _raw_spin_lock+0x36/0x69
> [<ffffffffa018e924>] nfs_inode_set_delegation+0x1f4/0x250 [nfs]
> [<ffffffffa017cb9b>] nfs4_opendata_to_nfs4_state+0x26c/0x2c9 [nfs]
> [<ffffffffa0181827>] nfs4_do_open+0x364/0x37c [nfs]
> [<ffffffffa0181867>] nfs4_atomic_open+0x28/0x45 [nfs]
> [<ffffffffa0165edc>] nfs_open_revalidate+0xf8/0x1a1 [nfs]
> [<ffffffff81135111>] d_revalidate+0x21/0x56
> [<ffffffff811351ca>] do_revalidate+0x1d/0x7a
> [<ffffffff811353eb>] do_lookup+0x1c4/0x1f8
> [<ffffffff81137406>] link_path_walk+0x260/0x485
> [<ffffffff8113786a>] do_path_lookup+0x50/0x10d
> [<ffffffff81138658>] do_filp_open+0x137/0x65e
> [<ffffffff81129e6e>] do_sys_open+0x60/0xf2
> [<ffffffff81129f33>] sys_open+0x20/0x22
> [<ffffffff8100ab82>] system_call_fastpath+0x16/0x1b
>
>
> On 2011-01-31 17:27, Fred Isaman wrote:
>> The pnfs code was using throughout the lock order i_lock, cl_lock.
>> This conflicts with the nfs delegation code. Rework the pnfs code
>> to avoid taking both locks simultaneously.
>>
>> Currently the code takes the double lock to add/remove the layout to a
>> nfs_client list, while atomically checking that the list of lsegs is
>> empty. To avoid this, we rely on existing serializations. When a
>> layout is initialized with lseg count equal zero, LAYOUTGET's
>> openstateid serialization is in effect, making it safe to assume it
>> stays zero unless we change it. And once a layout's lseg count drops
>> to zero, it is set as DESTROYED and so will stay at zero.
>>
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/callback_proc.c | 2 +-
>> fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++--------------------
>> 2 files changed, 30 insertions(+), 22 deletions(-)
>>
>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
>> index 8958757..2f41dcce 100644
>> --- a/fs/nfs/callback_proc.c
>> +++ b/fs/nfs/callback_proc.c
>> @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
>> rv = NFS4ERR_DELAY;
>> list_del_init(&lo->plh_bulk_recall);
>> spin_unlock(&ino->i_lock);
>> + pnfs_free_lseg_list(&free_me_list);
>> put_layout_hdr(lo);
>> iput(ino);
>> }
>> - pnfs_free_lseg_list(&free_me_list);
>> return rv;
>> }
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index f89c3bb..ee6c69a 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
>> BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
>> list_del(&lseg->pls_list);
>> if (list_empty(&lseg->pls_layout->plh_segs)) {
>> - struct nfs_client *clp;
>> -
>> - clp = NFS_SERVER(ino)->nfs_client;
>> - spin_lock(&clp->cl_lock);
>> - /* List does not take a reference, so no need for put here */
>> - list_del_init(&lseg->pls_layout->plh_layouts);
>> - spin_unlock(&clp->cl_lock);
>> set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
>> /* Matched by initial refcount set in alloc_init_layout_hdr */
>> put_layout_hdr_locked(lseg->pls_layout);
>> @@ -319,14 +312,30 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>> return invalid - removed;
>> }
>>
>> +/* note free_me must contain lsegs from a single layout_hdr */
>> void
>> pnfs_free_lseg_list(struct list_head *free_me)
>> {
>> - struct pnfs_layout_segment *lseg, *tmp;
>> + if (!list_empty(free_me)) {
>> + struct pnfs_layout_segment *lseg, *tmp;
>> + struct pnfs_layout_hdr *lo;
>>
>> - list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
>> - list_del(&lseg->pls_list);
>> - free_lseg(lseg);
>> + lo = list_first_entry(free_me,
>> + struct pnfs_layout_segment,
>> + pls_list)->pls_layout;
>> +
>> + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
>> + struct nfs_client *clp;
>> +
>> + clp = NFS_SERVER(lo->plh_inode)->nfs_client;
>> + spin_lock(&clp->cl_lock);
>> + list_del_init(&lo->plh_layouts);
>> + spin_unlock(&clp->cl_lock);
>> + }
>> + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
>> + list_del(&lseg->pls_list);
>> + free_lseg(lseg);
>> + }
>> }
>> }
>>
>> @@ -704,6 +713,7 @@ pnfs_update_layout(struct inode *ino,
>> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
>> struct pnfs_layout_hdr *lo;
>> struct pnfs_layout_segment *lseg = NULL;
>> + bool first = false;
>>
>> if (!pnfs_enabled_sb(NFS_SERVER(ino)))
>> return NULL;
>> @@ -734,7 +744,10 @@ pnfs_update_layout(struct inode *ino,
>> atomic_inc(&lo->plh_outstanding);
>>
>> get_layout_hdr(lo);
>> - if (list_empty(&lo->plh_segs)) {
>> + if (list_empty(&lo->plh_segs))
>> + first = true;
>> + spin_unlock(&ino->i_lock);
>> + if (first) {
>> /* The lo must be on the clp list if there is any
>> * chance of a CB_LAYOUTRECALL(FILE) coming in.
>> */
>> @@ -743,17 +756,12 @@ pnfs_update_layout(struct inode *ino,
>> list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
>> spin_unlock(&clp->cl_lock);
>> }
>> - spin_unlock(&ino->i_lock);
>>
>> lseg = send_layoutget(lo, ctx, iomode);
>> - if (!lseg) {
>> - spin_lock(&ino->i_lock);
>> - if (list_empty(&lo->plh_segs)) {
>> - spin_lock(&clp->cl_lock);
>> - list_del_init(&lo->plh_layouts);
>> - spin_unlock(&clp->cl_lock);
>> - }
>> - spin_unlock(&ino->i_lock);
>> + if (!lseg && first) {
>> + spin_lock(&clp->cl_lock);
>> + list_del_init(&lo->plh_layouts);
>> + spin_unlock(&clp->cl_lock);
>> }
>> atomic_dec(&lo->plh_outstanding);
>> put_layout_hdr(lo);