2010-05-10 03:37:46

by Zhang Jingwang

[permalink] [raw]
Subject: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

Optimize for sequencial write. Layout infos and tags are organized by
file offset. When appending data to a file whole list will be examined,
which introduce notable performance decrease.

Signed-off-by: Zhang Jingwang <[email protected]>
---
fs/nfs/blocklayout/extents.c | 126 +++++++++++++++++++++---------------------
1 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 3c311f2..514f2cc 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -66,8 +66,8 @@ static int32_t _find_entry(struct my_tree_t *tree, u64 s)
struct pnfs_inval_tracking *pos;

dprintk("%s(%llu) enter\n", __func__, s);
- list_for_each_entry(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector < s)
+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+ if (pos->it_sector > s)
continue;
else if (pos->it_sector == s)
return pos->it_tags & INTERNAL_MASK;
@@ -102,8 +102,8 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
struct pnfs_inval_tracking *pos;

dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
- list_for_each_entry(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector < s)
+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+ if (pos->it_sector > s)
continue;
else if (pos->it_sector == s) {
found = 1;
@@ -125,7 +125,7 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
}
new->it_sector = s;
new->it_tags = (1 << tag);
- list_add_tail(&new->it_link, &pos->it_link);
+ list_add(&new->it_link, &pos->it_link);
return 1;
}
}
@@ -230,14 +230,14 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
u64 expect = 0;

dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
- list_for_each_entry(pos, &tree->mtt_stub, it_link) {
- if (pos->it_sector < start)
+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+ if (pos->it_sector >= end)
continue;
if (!expect) {
- if ((pos->it_sector == start) &&
+ if ((pos->it_sector == end - tree->mtt_step_size) &&
(pos->it_tags & (1 << tag))) {
- expect = start + tree->mtt_step_size;
- if (expect == end)
+ expect = pos->it_sector - tree->mtt_step_size;
+ if (expect < start)
return 1;
continue;
} else {
@@ -246,8 +246,8 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
}
if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
return 0;
- expect += tree->mtt_step_size;
- if (expect == end)
+ expect -= tree->mtt_step_size;
+ if (expect < start)
return 1;
}
return 0;
@@ -594,65 +594,67 @@ add_and_merge_extent(struct pnfs_block_layout *bl,
/* Scan for proper place to insert, extending new to the left
* as much as possible.
*/
- list_for_each_entry_safe(be, tmp, list, be_node) {
- if (new->be_f_offset < be->be_f_offset)
+ list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+ if (new->be_f_offset >= be->be_f_offset + be->be_length)
break;
- if (end <= be->be_f_offset + be->be_length) {
- /* new is a subset of existing be*/
+ if (new->be_f_offset >= be->be_f_offset) {
+ if (end <= be->be_f_offset + be->be_length) {
+ /* new is a subset of existing be*/
+ if (extents_consistent(be, new)) {
+ dprintk("%s: new is subset, ignoring\n",
+ __func__);
+ put_extent(new);
+ return 0;
+ } else {
+ goto out_err;
+ }
+ } else {
+ /* |<-- be -->|
+ * |<-- new -->| */
+ if (extents_consistent(be, new)) {
+ /* extend new to fully replace be */
+ new->be_length += new->be_f_offset -
+ be->be_f_offset;
+ new->be_f_offset = be->be_f_offset;
+ new->be_v_offset = be->be_v_offset;
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ put_extent(be);
+ } else {
+ goto out_err;
+ }
+ }
+ } else if (end >= be->be_f_offset + be->be_length) {
+ /* new extent overlap existing be */
if (extents_consistent(be, new)) {
- dprintk("%s: new is subset, ignoring\n",
- __func__);
- put_extent(new);
- return 0;
- } else
+ /* extend new to fully replace be */
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ put_extent(be);
+ } else {
goto out_err;
- } else if (new->be_f_offset <=
- be->be_f_offset + be->be_length) {
- /* new overlaps or abuts existing be */
- if (extents_consistent(be, new)) {
+ }
+ } else if (end > be->be_f_offset) {
+ /* |<-- be -->|
+ *|<-- new -->| */
+ if (extents_consistent(new, be)) {
/* extend new to fully replace be */
- new->be_length += new->be_f_offset -
- be->be_f_offset;
- new->be_f_offset = be->be_f_offset;
- new->be_v_offset = be->be_v_offset;
+ new->be_length += be->be_f_offset + be->be_length -
+ new->be_f_offset - new->be_length;
dprintk("%s: removing %p\n", __func__, be);
list_del(&be->be_node);
put_extent(be);
- } else if (new->be_f_offset !=
- be->be_f_offset + be->be_length)
+ } else {
goto out_err;
+ }
}
}
/* Note that if we never hit the above break, be will not point to a
* valid extent. However, in that case &be->be_node==list.
*/
- list_add_tail(&new->be_node, &be->be_node);
+ list_add(&new->be_node, &be->be_node);
dprintk("%s: inserting new\n", __func__);
print_elist(list);
- /* Scan forward for overlaps. If we find any, extend new and
- * remove the overlapped extent.
- */
- be = list_prepare_entry(new, list, be_node);
- list_for_each_entry_safe_continue(be, tmp, list, be_node) {
- if (end < be->be_f_offset)
- break;
- /* new overlaps or abuts existing be */
- if (extents_consistent(be, new)) {
- if (end < be->be_f_offset + be->be_length) {
- /* extend new to fully cover be */
- end = be->be_f_offset + be->be_length;
- new->be_length = end - new->be_f_offset;
- }
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- put_extent(be);
- } else if (end != be->be_f_offset) {
- list_del(&new->be_node);
- goto out_err;
- }
- }
- dprintk("%s: after merging\n", __func__);
- print_elist(list);
/* STUB - The per-list consistency checks have all been done,
* should now check cross-list consistency.
*/
@@ -685,10 +687,10 @@ find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
if (ret &&
(!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
break;
- list_for_each_entry(be, &bl->bl_extents[i], be_node) {
- if (isect < be->be_f_offset)
+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+ if (isect >= be->be_f_offset + be->be_length)
break;
- if (isect < be->be_f_offset + be->be_length) {
+ if (isect >= be->be_f_offset) {
/* We have found an extent */
dprintk("%s Get %p (%i)\n", __func__, be,
atomic_read(&be->be_refcnt.refcount));
@@ -721,10 +723,10 @@ find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
for (i = 0; i < EXTENT_LISTS; i++) {
if (ret)
break;
- list_for_each_entry(be, &bl->bl_extents[i], be_node) {
- if (isect < be->be_f_offset)
+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+ if (isect >= be->be_f_offset + be->be_length)
break;
- if (isect < be->be_f_offset + be->be_length) {
+ if (isect >= be->be_f_offset) {
/* We have found an extent */
dprintk("%s Get %p (%i)\n", __func__, be,
atomic_read(&be->be_refcnt.refcount));
--
1.6.2.5



2010-05-21 23:00:13

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Thu, May 20, 2010 at 01:44:13PM +0800, Tao Guo wrote:
> Hi, Fields
> I used the connectathon general test with my latest code, and it
> passed the tests.
> Can you just try the two patches I have sent in the mailing list to
> see if it helps?

GENERAL TESTS: directory /mnt/TMP
if test ! -x runtests; then chmod a+x runtests; fi
cd /mnt/TMP; rm -f Makefile runtests runtests.wrk *.sh *.c mkdummy
rmdummy nroff.in makefile.tst
cp Makefile runtests runtests.wrk *.sh *.c mkdummy rmdummy nroff.in
makefile.tst /mnt/TMP

Small Compile
/usr/lib/gcc/x86_64-linux-gnu/4.4.1/../../../../lib/crt1.o: In function
`_start':
/build/buildd/eglibc-2.10.1/csu/../sysdeps/x86_64/elf/start.S:109:
undefined reference to `main'
collect2: ld returned 1 exit status
Command exited with non-zero status 1
0.02user 0.05system 0:00.12elapsed 66%CPU (0avgtext+0avgdata
22464maxresident)k
0inputs+0outputs (0major+4946minor)pagefaults 0swaps
general tests failed

I don't know what's going on there; sounds like data corruption?

--b.

2010-05-17 13:53:46

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
> On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
> > On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
> > > Optimize for sequencial write. Layout infos and tags are organized by
> > > file offset. When appending data to a file whole list will be examined,
> > > which introduce notable performance decrease.
> >
> > Looks good to me.
> >
> > Fred, can you please double check?
>
> I don't know if Fred's still up for reviewing block stuff?
>
> I've been trying to keep up with at least some minimal testing, but not
> as well as I'd like.
>
> The one thing I've noticed is that the connectathon general test has
> started failing right at the start with an IO error. The last good
> version I tested was b5c09c21, which was based on 33-rc6. The earliest
> bad version I tested was 419312ada, based on 34-rc2. A quick look at
> network traces from the two traces didn't turn up anything obvious. I
> haven't had the chance yet to look closer.

As of the latest (6666f47d), in my tests the client is falling back on
IO to the MDS and doing no block IO at all. b5c09c21 still works, so
the problem isn't due to a change in the server I'm testing against. I
haven't investigated any more closely.

--b.

2010-05-17 14:24:46

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
> On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
>> On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
>>> On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
>>>> Optimize for sequencial write. Layout infos and tags are organized by
>>>> file offset. When appending data to a file whole list will be examined,
>>>> which introduce notable performance decrease.
>>>
>>> Looks good to me.
>>>
>>> Fred, can you please double check?
>>
>> I don't know if Fred's still up for reviewing block stuff?
>>
>> I've been trying to keep up with at least some minimal testing, but not
>> as well as I'd like.
>>
>> The one thing I've noticed is that the connectathon general test has
>> started failing right at the start with an IO error. The last good
>> version I tested was b5c09c21, which was based on 33-rc6. The earliest
>> bad version I tested was 419312ada, based on 34-rc2. A quick look at
>> network traces from the two traces didn't turn up anything obvious. I
>> haven't had the chance yet to look closer.
>
> As of the latest (6666f47d), in my tests the client is falling back on
> IO to the MDS and doing no block IO at all. b5c09c21 still works, so
> the problem isn't due to a change in the server I'm testing against. I
> haven't investigated any more closely.
>

You might be hitting the .commit bug, no? Still no fix. I'm using a work
around for objects. I'm not sure how it affects blocks. I think you should
see that the very first IO goes through layout driver then the IO is redone
through MDS, for each node. Even though write/read returned success because
commit returns NOT_ATTEMPTED. But I might be totally off.

Boaz
> --b.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>


2010-05-17 14:53:15

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
> On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
> > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
> >> On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
> >>> On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
> >>>> Optimize for sequencial write. Layout infos and tags are organized by
> >>>> file offset. When appending data to a file whole list will be examined,
> >>>> which introduce notable performance decrease.
> >>>
> >>> Looks good to me.
> >>>
> >>> Fred, can you please double check?
> >>
> >> I don't know if Fred's still up for reviewing block stuff?
> >>
> >> I've been trying to keep up with at least some minimal testing, but not
> >> as well as I'd like.
> >>
> >> The one thing I've noticed is that the connectathon general test has
> >> started failing right at the start with an IO error. The last good
> >> version I tested was b5c09c21, which was based on 33-rc6. The earliest
> >> bad version I tested was 419312ada, based on 34-rc2. A quick look at
> >> network traces from the two traces didn't turn up anything obvious. I
> >> haven't had the chance yet to look closer.
> >
> > As of the latest (6666f47d), in my tests the client is falling back on
> > IO to the MDS and doing no block IO at all. b5c09c21 still works, so
> > the problem isn't due to a change in the server I'm testing against. I
> > haven't investigated any more closely.
> >
>
> You might be hitting the .commit bug, no? Still no fix. I'm using a work
> around for objects. I'm not sure how it affects blocks. I think you should
> see that the very first IO goes through layout driver then the IO is redone
> through MDS, for each node. Even though write/read returned success because
> commit returns NOT_ATTEMPTED. But I might be totally off.

I don't believe it's even attempting a GETLAYOUT.

I'll take a look at the network....--b.

2010-05-17 16:53:08

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Mon, May 17, 2010 at 10:53:11AM -0400, J. Bruce Fields wrote:
> On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
> > On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
> > > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
> > >> On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
> > >>> On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
> > >>>> Optimize for sequencial write. Layout infos and tags are organized by
> > >>>> file offset. When appending data to a file whole list will be examined,
> > >>>> which introduce notable performance decrease.
> > >>>
> > >>> Looks good to me.
> > >>>
> > >>> Fred, can you please double check?
> > >>
> > >> I don't know if Fred's still up for reviewing block stuff?
> > >>
> > >> I've been trying to keep up with at least some minimal testing, but not
> > >> as well as I'd like.
> > >>
> > >> The one thing I've noticed is that the connectathon general test has
> > >> started failing right at the start with an IO error. The last good
> > >> version I tested was b5c09c21, which was based on 33-rc6. The earliest
> > >> bad version I tested was 419312ada, based on 34-rc2. A quick look at
> > >> network traces from the two traces didn't turn up anything obvious. I
> > >> haven't had the chance yet to look closer.
> > >
> > > As of the latest (6666f47d), in my tests the client is falling back on
> > > IO to the MDS and doing no block IO at all. b5c09c21 still works, so
> > > the problem isn't due to a change in the server I'm testing against. I
> > > haven't investigated any more closely.
> > >
> >
> > You might be hitting the .commit bug, no? Still no fix. I'm using a work
> > around for objects. I'm not sure how it affects blocks. I think you should
> > see that the very first IO goes through layout driver then the IO is redone
> > through MDS, for each node. Even though write/read returned success because
> > commit returns NOT_ATTEMPTED. But I might be totally off.
>
> I don't believe it's even attempting a GETLAYOUT.
>
> I'll take a look at the network....--b.

Everything on the network looks fine, the server's doing the right
stuff, the client just never asks for a layout.

In fact, blk_initialize_mountpont is failing on the very first check:

if (server->pnfs_blksize == 0) {
dprintk("%s Server did not return blksize\n", __func__);
...

After rearranging the caller:

@@ -880,9 +880,9 @@ static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, stru

if (nfs4_has_session(clp) &&
(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) {
- set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
pnfs_set_ds_iosize(server);
server->pnfs_blksize = fsinfo->blksize;
+ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
}
#endif /* CONFIG_NFS_V4_1 */
}

it just fails a little later (see below). I haven't tried to go any
farther yet.

(But: why are the layout drivers using this odd pnfs_client_operations
indirection to call back to the common pnfs code? As far as I can tell
there's only one definition of the pnfs_client_operations, so we should
just remove that structure and call pnfs_getdevicelist, etc., by name.)

--b.

May 17 16:36:14 pearlet4 kernel: BUG: unable to handle kernel NULL pointer dereference at (null)
May 17 16:36:14 pearlet4 kernel: IP: [<ffffffff8122bc36>] _nfs4_pnfs_getdevicelist+0x26/0x110
May 17 16:36:14 pearlet4 kernel: PGD 6e11067 PUD 6e12067 PMD 0
May 17 16:36:14 pearlet4 kernel: Oops: 0000 [#1] PREEMPT
May 17 16:36:14 pearlet4 kernel: last sysfs file: /sys/kernel/uevent_seqnum
May 17 16:36:14 pearlet4 kernel: CPU 0
May 17 16:36:14 pearlet4 kernel: Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi
May 17 16:36:14 pearlet4 kernel:
May 17 16:36:14 pearlet4 kernel: Pid: 2794, comm: mount.nfs4 Not tainted 2.6.34-rc6-pnfs-00314-ga35e9c3 #136 /
May 17 16:36:14 pearlet4 kernel: RIP: 0010:[<ffffffff8122bc36>] [<ffffffff8122bc36>] _nfs4_pnfs_getdevicelist+0x26/0x110
May 17 16:36:14 pearlet4 kernel: RSP: 0018:ffff880004e99538 EFLAGS: 00010246
May 17 16:36:14 pearlet4 kernel: RAX: 0000000000000000 RBX: ffff880005fff378 RCX: ffff880004e99548
May 17 16:36:14 pearlet4 kernel: RDX: ffff880004ca24c8 RSI: ffff880004e99a28 RDI: ffff880005fff378
May 17 16:36:14 pearlet4 kernel: RBP: ffff880004e995c8 R08: 0000000000000000 R09: ffff880004ca24c8
May 17 16:36:14 pearlet4 kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff880004ca24c8
May 17 16:36:14 pearlet4 kernel: R13: ffff880004ca24c8 R14: ffff880004e995d8 R15: ffff880004e99a28
May 17 16:36:14 pearlet4 kernel: FS: 00007fed29c476f0(0000) GS:ffffffff81e1c000(0000) knlGS:0000000000000000
May 17 16:36:14 pearlet4 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
May 17 16:36:14 pearlet4 kernel: CR2: 0000000000000000 CR3: 0000000004e77000 CR4: 00000000000006f0
May 17 16:36:14 pearlet4 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
May 17 16:36:14 pearlet4 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
May 17 16:36:14 pearlet4 kernel: Process mount.nfs4 (pid: 2794, threadinfo ffff880004e98000, task ffff880004e78040)
May 17 16:36:14 pearlet4 kernel: Stack:
May 17 16:36:14 pearlet4 kernel: ffff880004e995c8 ffff880004e995c8 ffff880004e99588 ffffffff8190e5dc
May 17 16:36:14 pearlet4 kernel: <0> ffff880004e98000 ffff880004e995c8 ffff880004ca24c0 ffff880007800a80
May 17 16:36:14 pearlet4 kernel: <0> 0000000000000000 ffff880007800a80 ffff880004ca24c0 ffffffff810d46c6
May 17 16:36:14 pearlet4 kernel: Call Trace:
May 17 16:36:14 pearlet4 kernel: [<ffffffff8190e5dc>] ? klist_next+0x8c/0xf0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d46c6>] ? poison_obj+0x36/0x50
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d4a18>] ? cache_alloc_debugcheck_after+0xe8/0x1f0
May 17 16:36:14 pearlet4 kernel: [<ffffffff8122c21e>] nfs4_pnfs_getdevicelist+0x4e/0xa0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d677d>] ? kmem_cache_alloc_notrace+0xfd/0x1a0
May 17 16:36:14 pearlet4 kernel: [<ffffffff81250e81>] bl_initialize_mountpoint+0x161/0x6a0
May 17 16:36:14 pearlet4 kernel: [<ffffffff812497c9>] set_pnfs_layoutdriver+0x89/0x120
May 17 16:36:14 pearlet4 kernel: [<ffffffff8120c71f>] nfs_probe_fsinfo+0x54f/0x5f0
May 17 16:36:14 pearlet4 kernel: [<ffffffff8120d789>] nfs_clone_server+0x129/0x270
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d46c6>] ? poison_obj+0x36/0x50
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d4a18>] ? cache_alloc_debugcheck_after+0xe8/0x1f0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810f6db1>] ? alloc_vfsmnt+0xa1/0x180
May 17 16:36:14 pearlet4 kernel: [<ffffffff810d627d>] ? __kmalloc_track_caller+0x16d/0x2b0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810f6db1>] ? alloc_vfsmnt+0xa1/0x180
May 17 16:36:14 pearlet4 kernel: [<ffffffff81219fa1>] nfs4_xdev_get_sb+0x61/0x340
May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd15a>] vfs_kern_mount+0x8a/0x1e0
May 17 16:36:14 pearlet4 kernel: [<ffffffff81224f23>] nfs_follow_mountpoint+0x3b3/0x4b0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810e73b7>] link_path_walk+0xb67/0xd20
May 17 16:36:14 pearlet4 kernel: [<ffffffff810e76b0>] path_walk+0x60/0xd0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810e778d>] vfs_path_lookup+0x6d/0x90
May 17 16:36:14 pearlet4 kernel: [<ffffffff8121988d>] nfs_follow_remote_path+0x6d/0x170
May 17 16:36:14 pearlet4 kernel: [<ffffffff810637fd>] ? trace_hardirqs_on_caller+0x14d/0x190
May 17 16:36:14 pearlet4 kernel: [<ffffffff812197fb>] ? nfs_do_root_mount+0x8b/0xb0
May 17 16:36:14 pearlet4 kernel: [<ffffffff81219abf>] nfs4_try_mount+0x6f/0xd0
May 17 16:36:14 pearlet4 kernel: [<ffffffff81219bc2>] nfs4_get_sb+0xa2/0x360
May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd15a>] vfs_kern_mount+0x8a/0x1e0
May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd322>] do_kern_mount+0x52/0x130
May 17 16:36:14 pearlet4 kernel: [<ffffffff81926cda>] ? _lock_kernel+0x6a/0x16a
May 17 16:36:14 pearlet4 kernel: [<ffffffff810f788e>] do_mount+0x2de/0x850
May 17 16:36:14 pearlet4 kernel: [<ffffffff810f585a>] ? copy_mount_options+0xea/0x190
May 17 16:36:14 pearlet4 kernel: [<ffffffff810f7e98>] sys_mount+0x98/0xf0
May 17 16:36:14 pearlet4 kernel: [<ffffffff81002518>] system_call_fastpath+0x16/0x1b
May 17 16:36:14 pearlet4 kernel: Code: 00 00 00 00 00 55 48 89 e5 53 48 81 ec 88 00 00 00 0f 1f 44 00 00 48 8b 87 70 02 00 00 f6 05 75 38 7e 01 10 48 8d 4d 80 48 89 fb <8b> 00 48 89 55 80 48 8d 55 d0 48 c7 45 d8 00 00 00 00 48 c7 45
May 17 16:36:14 pearlet4 kernel: RIP [<ffffffff8122bc36>] _nfs4_pnfs_getdevicelist+0x26/0x110
May 17 16:36:14 pearlet4 kernel: RSP <ffff880004e99538>
May 17 16:36:14 pearlet4 kernel: CR2: 0000000000000000
May 17 16:36:14 pearlet4 kernel: ---[ end trace 3956532521eb7ba1 ]---
May 17 16:36:14 pearlet4 kernel: mount.nfs4 used greatest stack depth: 2104 bytes left
May 17 16:36:21 pearlet4 kernel: eth0: no IPv6 routers present
May 17 16:40:32 pearlet4 ntpd[2255]: synchronized to 91.189.94.4, stratum 2
May 17 16:40:32 pearlet4 ntpd[2255]: kernel time sync status change 2001

2010-05-17 17:22:55

by Zhang Jingwang

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

I've sent two patches to solve this problem, you can try them.

[PATCH] pnfs: set pnfs_curr_ld before calling initialize_mountpoint
[PATCH] pnfs: set pnfs_blksize before calling set_pnfs_layoutdriver

2010/5/18 J. Bruce Fields <[email protected]>:
> On Mon, May 17, 2010 at 10:53:11AM -0400, J. Bruce Fields wrote:
>> On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
>> > On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
>> > > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
>> > >> On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
>> > >>> On May. 10, 2010, 6:36 +0300, Zhang Jingwang <zhangjingwang@nr=
chpc.ac.cn> wrote:
>> > >>>> Optimize for sequencial write. Layout infos and tags are orga=
nized by
>> > >>>> file offset. When appending data to a file whole list will be=
examined,
>> > >>>> which introduce notable performance decrease.
>> > >>>
>> > >>> Looks good to me.
>> > >>>
>> > >>> Fred, can you please double check?
>> > >>
>> > >> I don't know if Fred's still up for reviewing block stuff?
>> > >>
>> > >> I've been trying to keep up with at least some minimal testing,=
but not
>> > >> as well as I'd like.
>> > >>
>> > >> The one thing I've noticed is that the connectathon general tes=
t has
>> > >> started failing right at the start with an IO error. =A0The las=
t good
>> > >> version I tested was b5c09c21, which was based on 33-rc6. =A0Th=
e earliest
>> > >> bad version I tested was 419312ada, based on 34-rc2. =A0A quick=
look at
>> > >> network traces from the two traces didn't turn up anything obvi=
ous. =A0I
>> > >> haven't had the chance yet to look closer.
>> > >
>> > > As of the latest (6666f47d), in my tests the client is falling b=
ack on
>> > > IO to the MDS and doing no block IO at all. =A0b5c09c21 still wo=
rks, so
>> > > the problem isn't due to a change in the server I'm testing agai=
nst. =A0I
>> > > haven't investigated any more closely.
>> > >
>> >
>> > You might be hitting the .commit bug, no? Still no fix. I'm using =
a work
>> > around for objects. I'm not sure how it affects blocks. I think yo=
u should
>> > see that the very first IO goes through layout driver then the IO =
is redone
>> > through MDS, for each node. Even though write/read returned succes=
s because
>> > commit returns NOT_ATTEMPTED. But I might be totally off.
>>
>> I don't believe it's even attempting a GETLAYOUT.
>>
>> I'll take a look at the network....--b.
>
> Everything on the network looks fine, the server's doing the right
> stuff, the client just never asks for a layout.
>
> In fact, blk_initialize_mountpont is failing on the very first check:
>
> =A0 =A0 =A0 =A0if (server->pnfs_blksize =3D=3D 0) {
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0dprintk("%s Server did not return blks=
ize\n", __func__);
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0...
>
> After rearranging the caller:
>
> @@ -880,9 +880,9 @@ static void nfs4_init_pnfs(struct nfs_server *ser=
ver, struct nfs_fh *mntfh, stru
>
> =A0 =A0 =A0 =A0if (nfs4_has_session(clp) &&
> =A0 =A0 =A0 =A0 =A0 =A0(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PN=
=46S_MDS)) {
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 set_pnfs_layoutdriver(server, mntfh, fs=
info->layouttype);
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0pnfs_set_ds_iosize(server);
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0server->pnfs_blksize =3D fsinfo->blksi=
ze;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 set_pnfs_layoutdriver(server, mntfh, fs=
info->layouttype);
> =A0 =A0 =A0 =A0}
> =A0#endif /* CONFIG_NFS_V4_1 */
> =A0}
>
> it just fails a little later (see below). =A0I haven't tried to go an=
y
> farther yet.
>
> (But: why are the layout drivers using this odd pnfs_client_operation=
s
> indirection to call back to the common pnfs code? =A0As far as I can =
tell
> there's only one definition of the pnfs_client_operations, so we shou=
ld
> just remove that structure and call pnfs_getdevicelist, etc., by name=
=2E)
>
> --b.
>
> May 17 16:36:14 pearlet4 kernel: BUG: unable to handle kernel NULL po=
inter dereference at (null)
> May 17 16:36:14 pearlet4 kernel: IP: [<ffffffff8122bc36>] _nfs4_pnfs_=
getdevicelist+0x26/0x110
> May 17 16:36:14 pearlet4 kernel: PGD 6e11067 PUD 6e12067 PMD 0
> May 17 16:36:14 pearlet4 kernel: Oops: 0000 [#1] PREEMPT
> May 17 16:36:14 pearlet4 kernel: last sysfs file: /sys/kernel/uevent_=
seqnum
> May 17 16:36:14 pearlet4 kernel: CPU 0
> May 17 16:36:14 pearlet4 kernel: Modules linked in: iscsi_tcp libiscs=
i_tcp libiscsi scsi_transport_iscsi
> May 17 16:36:14 pearlet4 kernel:
> May 17 16:36:14 pearlet4 kernel: Pid: 2794, comm: mount.nfs4 Not tain=
ted 2.6.34-rc6-pnfs-00314-ga35e9c3 #136 /
> May 17 16:36:14 pearlet4 kernel: RIP: 0010:[<ffffffff8122bc36>] =A0[<=
ffffffff8122bc36>] _nfs4_pnfs_getdevicelist+0x26/0x110
> May 17 16:36:14 pearlet4 kernel: RSP: 0018:ffff880004e99538 =A0EFLAGS=
: 00010246
> May 17 16:36:14 pearlet4 kernel: RAX: 0000000000000000 RBX: ffff88000=
5fff378 RCX: ffff880004e99548
> May 17 16:36:14 pearlet4 kernel: RDX: ffff880004ca24c8 RSI: ffff88000=
4e99a28 RDI: ffff880005fff378
> May 17 16:36:14 pearlet4 kernel: RBP: ffff880004e995c8 R08: 000000000=
0000000 R09: ffff880004ca24c8
> May 17 16:36:14 pearlet4 kernel: R10: 0000000000000000 R11: 000000000=
0000001 R12: ffff880004ca24c8
> May 17 16:36:14 pearlet4 kernel: R13: ffff880004ca24c8 R14: ffff88000=
4e995d8 R15: ffff880004e99a28
> May 17 16:36:14 pearlet4 kernel: FS: =A000007fed29c476f0(0000) GS:fff=
fffff81e1c000(0000) knlGS:0000000000000000
> May 17 16:36:14 pearlet4 kernel: CS: =A00010 DS: 0000 ES: 0000 CR0: 0=
00000008005003b
> May 17 16:36:14 pearlet4 kernel: CR2: 0000000000000000 CR3: 000000000=
4e77000 CR4: 00000000000006f0
> May 17 16:36:14 pearlet4 kernel: DR0: 0000000000000000 DR1: 000000000=
0000000 DR2: 0000000000000000
> May 17 16:36:14 pearlet4 kernel: DR3: 0000000000000000 DR6: 00000000f=
fff0ff0 DR7: 0000000000000400
> May 17 16:36:14 pearlet4 kernel: Process mount.nfs4 (pid: 2794, threa=
dinfo ffff880004e98000, task ffff880004e78040)
> May 17 16:36:14 pearlet4 kernel: Stack:
> May 17 16:36:14 pearlet4 kernel: ffff880004e995c8 ffff880004e995c8 ff=
ff880004e99588 ffffffff8190e5dc
> May 17 16:36:14 pearlet4 kernel: <0> ffff880004e98000 ffff880004e995c=
8 ffff880004ca24c0 ffff880007800a80
> May 17 16:36:14 pearlet4 kernel: <0> 0000000000000000 ffff880007800a8=
0 ffff880004ca24c0 ffffffff810d46c6
> May 17 16:36:14 pearlet4 kernel: Call Trace:
> May 17 16:36:14 pearlet4 kernel: [<ffffffff8190e5dc>] ? klist_next+0x=
8c/0xf0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d46c6>] ? poison_obj+0x=
36/0x50
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d4a18>] ? cache_alloc_d=
ebugcheck_after+0xe8/0x1f0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff8122c21e>] nfs4_pnfs_getde=
vicelist+0x4e/0xa0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d677d>] ? kmem_cache_al=
loc_notrace+0xfd/0x1a0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81250e81>] bl_initialize_m=
ountpoint+0x161/0x6a0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff812497c9>] set_pnfs_layout=
driver+0x89/0x120
> May 17 16:36:14 pearlet4 kernel: [<ffffffff8120c71f>] nfs_probe_fsinf=
o+0x54f/0x5f0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff8120d789>] nfs_clone_serve=
r+0x129/0x270
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d46c6>] ? poison_obj+0x=
36/0x50
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d4a18>] ? cache_alloc_d=
ebugcheck_after+0xe8/0x1f0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810f6db1>] ? alloc_vfsmnt+=
0xa1/0x180
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810d627d>] ? __kmalloc_tra=
ck_caller+0x16d/0x2b0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810f6db1>] ? alloc_vfsmnt+=
0xa1/0x180
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81219fa1>] nfs4_xdev_get_s=
b+0x61/0x340
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd15a>] vfs_kern_mount+=
0x8a/0x1e0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81224f23>] nfs_follow_moun=
tpoint+0x3b3/0x4b0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810e73b7>] link_path_walk+=
0xb67/0xd20
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810e76b0>] path_walk+0x60/=
0xd0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810e778d>] vfs_path_lookup=
+0x6d/0x90
> May 17 16:36:14 pearlet4 kernel: [<ffffffff8121988d>] nfs_follow_remo=
te_path+0x6d/0x170
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810637fd>] ? trace_hardirq=
s_on_caller+0x14d/0x190
> May 17 16:36:14 pearlet4 kernel: [<ffffffff812197fb>] ? nfs_do_root_m=
ount+0x8b/0xb0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81219abf>] nfs4_try_mount+=
0x6f/0xd0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81219bc2>] nfs4_get_sb+0xa=
2/0x360
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd15a>] vfs_kern_mount+=
0x8a/0x1e0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810dd322>] do_kern_mount+0=
x52/0x130
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81926cda>] ? _lock_kernel+=
0x6a/0x16a
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810f788e>] do_mount+0x2de/=
0x850
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810f585a>] ? copy_mount_op=
tions+0xea/0x190
> May 17 16:36:14 pearlet4 kernel: [<ffffffff810f7e98>] sys_mount+0x98/=
0xf0
> May 17 16:36:14 pearlet4 kernel: [<ffffffff81002518>] system_call_fas=
tpath+0x16/0x1b
> May 17 16:36:14 pearlet4 kernel: Code: 00 00 00 00 00 55 48 89 e5 53 =
48 81 ec 88 00 00 00 0f 1f 44 00 00 48 8b 87 70 02 00 00 f6 05 75 38 7e=
01 10 48 8d 4d 80 48 89 fb <8b> 00 48 89 55 80 48 8d 55 d0 48 c7 45 d8=
00 00 00 00 48 c7 45
> May 17 16:36:14 pearlet4 kernel: RIP =A0[<ffffffff8122bc36>] _nfs4_pn=
fs_getdevicelist+0x26/0x110
> May 17 16:36:14 pearlet4 kernel: RSP <ffff880004e99538>
> May 17 16:36:14 pearlet4 kernel: CR2: 0000000000000000
> May 17 16:36:14 pearlet4 kernel: ---[ end trace 3956532521eb7ba1 ]---
> May 17 16:36:14 pearlet4 kernel: mount.nfs4 used greatest stack depth=
: 2104 bytes left
> May 17 16:36:21 pearlet4 kernel: eth0: no IPv6 routers present
> May 17 16:40:32 pearlet4 ntpd[2255]: synchronized to 91.189.94.4, str=
atum 2
> May 17 16:40:32 pearlet4 ntpd[2255]: kernel time sync status change 2=
001
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to [email protected]
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>



--=20
Zhang Jingwang
National Research Centre for High Performance Computers
Institute of Computing Technology, Chinese Academy of Sciences
No. 6, South Kexueyuan Road, Haidian District
Beijing, China

2010-05-18 16:20:10

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Tue, May 18, 2010 at 01:22:52AM +0800, Zhang Jingwang wrote:
> I've sent two patches to solve this problem, you can try them.
>=20
> [PATCH] pnfs: set pnfs_curr_ld before calling initialize_mountpoint
> [PATCH] pnfs: set pnfs_blksize before calling set_pnfs_layoutdriver

Thanks. With Benny's latest block all (97602fc6, which includes the tw=
o
patches above), I'm back to the previous behavior:

>=20
> 2010/5/18 J. Bruce Fields <[email protected]>:
> > On Mon, May 17, 2010 at 10:53:11AM -0400, J. Bruce Fields wrote:
> >> On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
> >> > On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
> >> > > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
> >> > >> The one thing I've noticed is that the connectathon general t=
est has
> >> > >> started failing right at the start with an IO error. =C2=A0Th=
e last good
> >> > >> version I tested was b5c09c21, which was based on 33-rc6. =C2=
=A0The earliest
> >> > >> bad version I tested was 419312ada, based on 34-rc2. =C2=A0A =
quick look at
> >> > >> network traces from the two traces didn't turn up anything ob=
vious. =C2=A0I
> >> > >> haven't had the chance yet to look closer.

So I still see the IO error at the start of the connectathon general
tests.

Also, I get the following warning--I don't know if it's new or not.

--b.

=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D
[ INFO: possible circular locking dependency detected ]
2.6.34-pnfs-00322-g97602fc #141
-------------------------------------------------------
cp/2789 is trying to acquire lock:
(&(&nfsi->lo_lock)->rlock){+.+...}, at: [<ffffffff8124dbee>] T.947+0x4=
e/0x210

but task is already holding lock:
(&sb->s_type->i_lock_key#11){+.+...}, at: [<ffffffff81223689>] nfs_upd=
atepage+0x139/0x5a0

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #2 (&sb->s_type->i_lock_key#11){+.+...}:
[<ffffffff81065913>] __lock_acquire+0x1293/0x1d30
[<ffffffff81066442>] lock_acquire+0x92/0x170
[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
[<ffffffff81244173>] nfs_inode_set_delegation+0x203/0x2c0
[<ffffffff81231b7a>] nfs4_opendata_to_nfs4_state+0x31a/0x3d0
[<ffffffff81231fb2>] nfs4_do_open+0x242/0x460
[<ffffffff81232a05>] nfs4_proc_create+0x85/0x220
[<ffffffff8120ec64>] nfs_create+0x74/0x120
[<ffffffff810e5d63>] vfs_create+0xb3/0x100
[<ffffffff810e656b>] do_last+0x59b/0x6c0
[<ffffffff810e88e2>] do_filp_open+0x212/0x690
[<ffffffff810d8059>] do_sys_open+0x69/0x140
[<ffffffff810d8170>] sys_open+0x20/0x30
[<ffffffff81002518>] system_call_fastpath+0x16/0x1b

-> #1 (&(&clp->cl_lock)->rlock){+.+...}:
[<ffffffff81065913>] __lock_acquire+0x1293/0x1d30
[<ffffffff81066442>] lock_acquire+0x92/0x170
[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
[<ffffffff8124b378>] pnfs_update_layout+0x2f8/0xaf0
[<ffffffff8124c7e4>] pnfs_file_write+0x64/0xc0
[<ffffffff810daab7>] vfs_write+0xb7/0x180
[<ffffffff810dac71>] sys_write+0x51/0x90
[<ffffffff81002518>] system_call_fastpath+0x16/0x1b

-> #0 (&(&nfsi->lo_lock)->rlock){+.+...}:
[<ffffffff81065dd2>] __lock_acquire+0x1752/0x1d30
[<ffffffff81066442>] lock_acquire+0x92/0x170
[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
[<ffffffff8124dbee>] T.947+0x4e/0x210
[<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
[<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
[<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
[<ffffffff810a3397>] generic_file_buffered_write+0x187/0x2a0
[<ffffffff810a5890>] __generic_file_aio_write+0x240/0x460
[<ffffffff810a5b17>] generic_file_aio_write+0x67/0xd0
[<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
[<ffffffff810d9fca>] do_sync_write+0xda/0x120
[<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
[<ffffffff810daab7>] vfs_write+0xb7/0x180
[<ffffffff810dac71>] sys_write+0x51/0x90
[<ffffffff81002518>] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

2 locks held by cp/2789:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff810a5b04>] g=
eneric_file_aio_write+0x54/0xd0
#1: (&sb->s_type->i_lock_key#11){+.+...}, at: [<ffffffff81223689>] nf=
s_updatepage+0x139/0x5a0

stack backtrace:
Pid: 2789, comm: cp Not tainted 2.6.34-pnfs-00322-g97602fc #141
Call Trace:
[<ffffffff81064033>] print_circular_bug+0xf3/0x100
[<ffffffff81065dd2>] __lock_acquire+0x1752/0x1d30
[<ffffffff81066442>] lock_acquire+0x92/0x170
[<ffffffff8124dbee>] ? T.947+0x4e/0x210
[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
[<ffffffff8124dbee>] ? T.947+0x4e/0x210
[<ffffffff8124dbee>] T.947+0x4e/0x210
[<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
[<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
[<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
[<ffffffff810a3397>] generic_file_buffered_write+0x187/0x2a0
[<ffffffff810a5890>] __generic_file_aio_write+0x240/0x460
[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
[<ffffffff810a5b17>] generic_file_aio_write+0x67/0xd0
[<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
[<ffffffff810d9fca>] do_sync_write+0xda/0x120
[<ffffffff810528a0>] ? autoremove_wake_function+0x0/0x40
[<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
[<ffffffff810daab7>] vfs_write+0xb7/0x180
[<ffffffff810dac71>] sys_write+0x51/0x90
[<ffffffff81002518>] system_call_fastpath+0x16/0x1b
eth0: no IPv6 routers present

2010-05-19 04:56:44

by Tao Guo

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

I think the warning just indicate a possible bug:
nfs_inode_set_delegation():
clp->cl_lock
--> inode->i_lock
get_lock_alloc_layout():
nfsi->lo_lock
--> clp->cl_lock
nfs_try_to_update_request()->pnfs_do_flush()->_pnfs_do_flush()->
pnfs_find_get_lseg()->get_lock_current_layout():

inode->i_lock

-->nfsi->lo_lock
In nfs_inode_set_delegation(), maybe we should unlock clp->cl_lock befo=
re
taking inode->i_lock spinlock?

PS: I just use the latest pnfsblock code(pnfs-all-2.6.34-2010-05-17) do=
ing some
basic r/w tests and it works fine. Can you find out which code path
lead to IO errror?

On Wed, May 19, 2010 at 12:20 AM, J. Bruce Fields <[email protected]=
> wrote:
> On Tue, May 18, 2010 at 01:22:52AM +0800, Zhang Jingwang wrote:
>> I've sent two patches to solve this problem, you can try them.
>>
>> [PATCH] pnfs: set pnfs_curr_ld before calling initialize_mountpoint
>> [PATCH] pnfs: set pnfs_blksize before calling set_pnfs_layoutdriver
>
> Thanks. =C2=A0With Benny's latest block all (97602fc6, which includes=
the two
> patches above), I'm back to the previous behavior:
>
>>
>> 2010/5/18 J. Bruce Fields <[email protected]>:
>> > On Mon, May 17, 2010 at 10:53:11AM -0400, J. Bruce Fields wrote:
>> >> On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
>> >> > On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
>> >> > > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
>> >> > >> The one thing I've noticed is that the connectathon general =
test has
>> >> > >> started failing right at the start with an IO error. =C2=A0T=
he last good
>> >> > >> version I tested was b5c09c21, which was based on 33-rc6. =C2=
=A0The earliest
>> >> > >> bad version I tested was 419312ada, based on 34-rc2. =C2=A0A=
quick look at
>> >> > >> network traces from the two traces didn't turn up anything o=
bvious. =C2=A0I
>> >> > >> haven't had the chance yet to look closer.
>
> So I still see the IO error at the start of the connectathon general
> tests.
>
> Also, I get the following warning--I don't know if it's new or not.
>
> --b.
>
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D
> [ INFO: possible circular locking dependency detected ]
> 2.6.34-pnfs-00322-g97602fc #141
> -------------------------------------------------------
> cp/2789 is trying to acquire lock:
> =C2=A0(&(&nfsi->lo_lock)->rlock){+.+...}, at: [<ffffffff8124dbee>] T.=
947+0x4e/0x210
>
> but task is already holding lock:
> =C2=A0(&sb->s_type->i_lock_key#11){+.+...}, at: [<ffffffff81223689>] =
nfs_updatepage+0x139/0x5a0
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #2 (&sb->s_type->i_lock_key#11){+.+...}:
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065913>] __lock_acquire+0x1293/0x1d3=
0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81244173>] nfs_inode_set_delegation+0x=
203/0x2c0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81231b7a>] nfs4_opendata_to_nfs4_state=
+0x31a/0x3d0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81231fb2>] nfs4_do_open+0x242/0x460
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81232a05>] nfs4_proc_create+0x85/0x220
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8120ec64>] nfs_create+0x74/0x120
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e5d63>] vfs_create+0xb3/0x100
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e656b>] do_last+0x59b/0x6c0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e88e2>] do_filp_open+0x212/0x690
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d8059>] do_sys_open+0x69/0x140
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d8170>] sys_open+0x20/0x30
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16/0=
x1b
>
> -> #1 (&(&clp->cl_lock)->rlock){+.+...}:
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065913>] __lock_acquire+0x1293/0x1d3=
0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124b378>] pnfs_update_layout+0x2f8/0x=
af0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124c7e4>] pnfs_file_write+0x64/0xc0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810daab7>] vfs_write+0xb7/0x180
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810dac71>] sys_write+0x51/0x90
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16/0=
x1b
>
> -> #0 (&(&nfsi->lo_lock)->rlock){+.+...}:
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065dd2>] __lock_acquire+0x1752/0x1d3=
0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124dbee>] T.947+0x4e/0x210
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a3397>] generic_file_buffered_write=
+0x187/0x2a0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a5890>] __generic_file_aio_write+0x=
240/0x460
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a5b17>] generic_file_aio_write+0x67=
/0xd0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d9fca>] do_sync_write+0xda/0x120
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810daab7>] vfs_write+0xb7/0x180
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff810dac71>] sys_write+0x51/0x90
> =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16/0=
x1b
>
> other info that might help us debug this:
>
> 2 locks held by cp/2789:
> =C2=A0#0: =C2=A0(&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff=
810a5b04>] generic_file_aio_write+0x54/0xd0
> =C2=A0#1: =C2=A0(&sb->s_type->i_lock_key#11){+.+...}, at: [<ffffffff8=
1223689>] nfs_updatepage+0x139/0x5a0
>
> stack backtrace:
> Pid: 2789, comm: cp Not tainted 2.6.34-pnfs-00322-g97602fc #141
> Call Trace:
> =C2=A0[<ffffffff81064033>] print_circular_bug+0xf3/0x100
> =C2=A0[<ffffffff81065dd2>] __lock_acquire+0x1752/0x1d30
> =C2=A0[<ffffffff81066442>] lock_acquire+0x92/0x170
> =C2=A0[<ffffffff8124dbee>] ? T.947+0x4e/0x210
> =C2=A0[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
> =C2=A0[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> =C2=A0[<ffffffff8124dbee>] ? T.947+0x4e/0x210
> =C2=A0[<ffffffff8124dbee>] T.947+0x4e/0x210
> =C2=A0[<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
> =C2=A0[<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
> =C2=A0[<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
> =C2=A0[<ffffffff810a3397>] generic_file_buffered_write+0x187/0x2a0
> =C2=A0[<ffffffff810a5890>] __generic_file_aio_write+0x240/0x460
> =C2=A0[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
> =C2=A0[<ffffffff810a5b17>] generic_file_aio_write+0x67/0xd0
> =C2=A0[<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
> =C2=A0[<ffffffff810d9fca>] do_sync_write+0xda/0x120
> =C2=A0[<ffffffff810528a0>] ? autoremove_wake_function+0x0/0x40
> =C2=A0[<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
> =C2=A0[<ffffffff810daab7>] vfs_write+0xb7/0x180
> =C2=A0[<ffffffff810dac71>] sys_write+0x51/0x90
> =C2=A0[<ffffffff81002518>] system_call_fastpath+0x16/0x1b
> eth0: no IPv6 routers present
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to [email protected]
> More majordomo info at =C2=A0http://vger.kernel.org/majordomo-info.ht=
ml
>



--=20
tao.

2010-05-19 16:36:35

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Wed, May 19, 2010 at 12:56:42PM +0800, Tao Guo wrote:
> I think the warning just indicate a possible bug:
> nfs_inode_set_delegation():
> clp->cl_lock
> --> inode->i_lock
> get_lock_alloc_layout():
> nfsi->lo_lock
> --> clp->cl_lock
> nfs_try_to_update_request()->pnfs_do_flush()->_pnfs_do_flush()->
> pnfs_find_get_lseg()->get_lock_current_layout():
>=20
> inode->i_lock
>=20
> -->nfsi->lo_lock
> In nfs_inode_set_delegation(), maybe we should unlock clp->cl_lock be=
fore
> taking inode->i_lock spinlock?
>=20
> PS: I just use the latest pnfsblock code(pnfs-all-2.6.34-2010-05-17) =
doing some
> basic r/w tests and it works fine.

Could you try running the connectathon general test?

> Can you find out which code path
> lead to IO errror?

I'll try to narrow down the test case.

--b.

>=20
> On Wed, May 19, 2010 at 12:20 AM, J. Bruce Fields <[email protected]=
rg> wrote:
> > On Tue, May 18, 2010 at 01:22:52AM +0800, Zhang Jingwang wrote:
> >> I've sent two patches to solve this problem, you can try them.
> >>
> >> [PATCH] pnfs: set pnfs_curr_ld before calling initialize_mountpoin=
t
> >> [PATCH] pnfs: set pnfs_blksize before calling set_pnfs_layoutdrive=
r
> >
> > Thanks. =C2=A0With Benny's latest block all (97602fc6, which includ=
es the two
> > patches above), I'm back to the previous behavior:
> >
> >>
> >> 2010/5/18 J. Bruce Fields <[email protected]>:
> >> > On Mon, May 17, 2010 at 10:53:11AM -0400, J. Bruce Fields wrote:
> >> >> On Mon, May 17, 2010 at 05:24:39PM +0300, Boaz Harrosh wrote:
> >> >> > On 05/17/2010 04:53 PM, J. Bruce Fields wrote:
> >> >> > > On Wed, May 12, 2010 at 04:28:12PM -0400, bfields wrote:
> >> >> > >> The one thing I've noticed is that the connectathon genera=
l test has
> >> >> > >> started failing right at the start with an IO error. =C2=A0=
The last good
> >> >> > >> version I tested was b5c09c21, which was based on 33-rc6. =
=C2=A0The earliest
> >> >> > >> bad version I tested was 419312ada, based on 34-rc2. =C2=A0=
A quick look at
> >> >> > >> network traces from the two traces didn't turn up anything=
obvious. =C2=A0I
> >> >> > >> haven't had the chance yet to look closer.
> >
> > So I still see the IO error at the start of the connectathon genera=
l
> > tests.
> >
> > Also, I get the following warning--I don't know if it's new or not.
> >
> > --b.
> >
> > =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D
> > [ INFO: possible circular locking dependency detected ]
> > 2.6.34-pnfs-00322-g97602fc #141
> > -------------------------------------------------------
> > cp/2789 is trying to acquire lock:
> > =C2=A0(&(&nfsi->lo_lock)->rlock){+.+...}, at: [<ffffffff8124dbee>] =
T.947+0x4e/0x210
> >
> > but task is already holding lock:
> > =C2=A0(&sb->s_type->i_lock_key#11){+.+...}, at: [<ffffffff81223689>=
] nfs_updatepage+0x139/0x5a0
> >
> > which lock already depends on the new lock.
> >
> >
> > the existing dependency chain (in reverse order) is:
> >
> > -> #2 (&sb->s_type->i_lock_key#11){+.+...}:
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065913>] __lock_acquire+0x1293/0x1=
d30
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81244173>] nfs_inode_set_delegation+=
0x203/0x2c0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81231b7a>] nfs4_opendata_to_nfs4_sta=
te+0x31a/0x3d0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81231fb2>] nfs4_do_open+0x242/0x460
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81232a05>] nfs4_proc_create+0x85/0x2=
20
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8120ec64>] nfs_create+0x74/0x120
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e5d63>] vfs_create+0xb3/0x100
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e656b>] do_last+0x59b/0x6c0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810e88e2>] do_filp_open+0x212/0x690
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d8059>] do_sys_open+0x69/0x140
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d8170>] sys_open+0x20/0x30
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16=
/0x1b
> >
> > -> #1 (&(&clp->cl_lock)->rlock){+.+...}:
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065913>] __lock_acquire+0x1293/0x1=
d30
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124b378>] pnfs_update_layout+0x2f8/=
0xaf0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124c7e4>] pnfs_file_write+0x64/0xc0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810daab7>] vfs_write+0xb7/0x180
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810dac71>] sys_write+0x51/0x90
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16=
/0x1b
> >
> > -> #0 (&(&nfsi->lo_lock)->rlock){+.+...}:
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81065dd2>] __lock_acquire+0x1752/0x1=
d30
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81066442>] lock_acquire+0x92/0x170
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124dbee>] T.947+0x4e/0x210
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a3397>] generic_file_buffered_wri=
te+0x187/0x2a0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a5890>] __generic_file_aio_write+=
0x240/0x460
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810a5b17>] generic_file_aio_write+0x=
67/0xd0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810d9fca>] do_sync_write+0xda/0x120
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810daab7>] vfs_write+0xb7/0x180
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff810dac71>] sys_write+0x51/0x90
> > =C2=A0 =C2=A0 =C2=A0 [<ffffffff81002518>] system_call_fastpath+0x16=
/0x1b
> >
> > other info that might help us debug this:
> >
> > 2 locks held by cp/2789:
> > =C2=A0#0: =C2=A0(&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffff=
ff810a5b04>] generic_file_aio_write+0x54/0xd0
> > =C2=A0#1: =C2=A0(&sb->s_type->i_lock_key#11){+.+...}, at: [<fffffff=
f81223689>] nfs_updatepage+0x139/0x5a0
> >
> > stack backtrace:
> > Pid: 2789, comm: cp Not tainted 2.6.34-pnfs-00322-g97602fc #141
> > Call Trace:
> > =C2=A0[<ffffffff81064033>] print_circular_bug+0xf3/0x100
> > =C2=A0[<ffffffff81065dd2>] __lock_acquire+0x1752/0x1d30
> > =C2=A0[<ffffffff81066442>] lock_acquire+0x92/0x170
> > =C2=A0[<ffffffff8124dbee>] ? T.947+0x4e/0x210
> > =C2=A0[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
> > =C2=A0[<ffffffff81925d5b>] _raw_spin_lock+0x3b/0x50
> > =C2=A0[<ffffffff8124dbee>] ? T.947+0x4e/0x210
> > =C2=A0[<ffffffff8124dbee>] T.947+0x4e/0x210
> > =C2=A0[<ffffffff8124ddfb>] _pnfs_do_flush+0x4b/0xf0
> > =C2=A0[<ffffffff8122364d>] nfs_updatepage+0xfd/0x5a0
> > =C2=A0[<ffffffff812126b5>] nfs_write_end+0x265/0x3e0
> > =C2=A0[<ffffffff810a3397>] generic_file_buffered_write+0x187/0x2a0
> > =C2=A0[<ffffffff810a5890>] __generic_file_aio_write+0x240/0x460
> > =C2=A0[<ffffffff81929d59>] ? sub_preempt_count+0x9/0xa0
> > =C2=A0[<ffffffff810a5b17>] generic_file_aio_write+0x67/0xd0
> > =C2=A0[<ffffffff81213661>] nfs_file_write+0xb1/0x1f0
> > =C2=A0[<ffffffff810d9fca>] do_sync_write+0xda/0x120
> > =C2=A0[<ffffffff810528a0>] ? autoremove_wake_function+0x0/0x40
> > =C2=A0[<ffffffff8124c802>] pnfs_file_write+0x82/0xc0
> > =C2=A0[<ffffffff810daab7>] vfs_write+0xb7/0x180
> > =C2=A0[<ffffffff810dac71>] sys_write+0x51/0x90
> > =C2=A0[<ffffffff81002518>] system_call_fastpath+0x16/0x1b
> > eth0: no IPv6 routers present
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-nfs=
" in
> > the body of a message to [email protected]
> > More majordomo info at =C2=A0http://vger.kernel.org/majordomo-info.=
html
> >
>=20
>=20
>=20
> --=20
> tao.

2010-05-19 21:38:52

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Wed, May 19, 2010 at 12:36:32PM -0400, J. Bruce Fields wrote:
> On Wed, May 19, 2010 at 12:56:42PM +0800, Tao Guo wrote:
> > I think the warning just indicate a possible bug:
> > nfs_inode_set_delegation():
> > clp->cl_lock
> > --> inode->i_lock
> > get_lock_alloc_layout():
> > nfsi->lo_lock
> > --> clp->cl_lock
> > nfs_try_to_update_request()->pnfs_do_flush()->_pnfs_do_flush()->
> > pnfs_find_get_lseg()->get_lock_current_layout():
> >
> > inode->i_lock
> >
> > -->nfsi->lo_lock
> > In nfs_inode_set_delegation(), maybe we should unlock clp->cl_lock before
> > taking inode->i_lock spinlock?
> >
> > PS: I just use the latest pnfsblock code(pnfs-all-2.6.34-2010-05-17) doing some
> > basic r/w tests and it works fine.
>
> Could you try running the connectathon general test?
>
> > Can you find out which code path
> > lead to IO errror?
>
> I'll try to narrow down the test case.

I haven't gotten the chance to narrow it down any more. So all I can
say is that:

~/cthon04# NFSTESTDIR=/mnt/TMP ./runtests -g

GENERAL TESTS: directory /mnt/TMP
if test ! -x runtests; then chmod a+x runtests; fi
cd /mnt/TMP; rm -f Makefile runtests runtests.wrk *.sh *.c
mkdummy rmdummy nroff.in makefile.tst
cp Makefile runtests runtests.wrk *.sh *.c mkdummy rmdummy
nroff.in makefile.tst /mnt/TMP

Small Compile
runtests.wrk: 63: ./stat: Input/output error
general tests failed

is a consistent result for me. (Using the tests from
http://www.connectathon.org/nfstests.html.)

--b.

2010-05-20 05:44:15

by Tao Guo

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

Hi, Fields
I used the connectathon general test with my latest code, and it
passed the tests.
Can you just try the two patches I have sent in the mailing list to
see if it helps?

On Thu, May 20, 2010 at 5:38 AM, J. Bruce Fields <[email protected]>=
wrote:
> On Wed, May 19, 2010 at 12:36:32PM -0400, J. Bruce Fields wrote:
>> On Wed, May 19, 2010 at 12:56:42PM +0800, Tao Guo wrote:
>> > I think the warning just indicate a possible bug:
>> > nfs_inode_set_delegation():
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
clp->cl_lock
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0=
=C2=A0 =C2=A0 --> inode->i_lock
>> > get_lock_alloc_layout():
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0nfsi->lo_loc=
k
>> > =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=
=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 --> =
clp->cl_lock
>> > nfs_try_to_update_request()->pnfs_do_flush()->_pnfs_do_flush()->
>> > pnfs_find_get_lseg()->get_lock_current_layout():
>> >
>> > inode->i_lock
>> >
>> > -->nfsi->lo_lock
>> > In nfs_inode_set_delegation(), maybe we should unlock clp->cl_lock=
before
>> > taking inode->i_lock spinlock?
>> >
>> > PS: I just use the latest pnfsblock code(pnfs-all-2.6.34-2010-05-1=
7) doing some
>> > basic r/w tests and it works fine.
>>
>> Could you try running the connectathon general test?
>>
>> > Can you find out which code path
>> > lead to IO errror?
>>
>> I'll try to narrow down the test case.
>
> I haven't gotten the chance to narrow it down any more. =C2=A0So all =
I can
> say is that:
>
> =C2=A0 =C2=A0 =C2=A0 =C2=A0~/cthon04# NFSTESTDIR=3D/mnt/TMP ./runtest=
s -g
>
> =C2=A0 =C2=A0 =C2=A0 =C2=A0GENERAL TESTS: directory /mnt/TMP
> =C2=A0 =C2=A0 =C2=A0 =C2=A0if test ! -x runtests; then chmod a+x runt=
ests; fi
> =C2=A0 =C2=A0 =C2=A0 =C2=A0cd /mnt/TMP; rm -f Makefile runtests runte=
sts.wrk *.sh *.c
> =C2=A0 =C2=A0 =C2=A0 =C2=A0mkdummy rmdummy nroff.in makefile.tst
> =C2=A0 =C2=A0 =C2=A0 =C2=A0cp Makefile runtests runtests.wrk *.sh *.c=
mkdummy rmdummy
> =C2=A0 =C2=A0 =C2=A0 =C2=A0nroff.in makefile.tst /mnt/TMP
>
> =C2=A0 =C2=A0 =C2=A0 =C2=A0Small Compile
> =C2=A0 =C2=A0 =C2=A0 =C2=A0runtests.wrk: 63: ./stat: Input/output err=
or
> =C2=A0 =C2=A0 =C2=A0 =C2=A0general tests failed
>
> is a consistent result for me. =C2=A0(Using the tests from
> http://www.connectathon.org/nfstests.html.)
>
> --b.
>



--=20
tao.

2010-05-12 06:46:48

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
> Optimize for sequencial write. Layout infos and tags are organized by
> file offset. When appending data to a file whole list will be examined,
> which introduce notable performance decrease.

Looks good to me.

Fred, can you please double check?

Benny

P.S.: Zhang, please note Fred's new email address

>
> Signed-off-by: Zhang Jingwang <[email protected]>
> ---
> fs/nfs/blocklayout/extents.c | 126 +++++++++++++++++++++---------------------
> 1 files changed, 64 insertions(+), 62 deletions(-)
>
> diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
> index 3c311f2..514f2cc 100644
> --- a/fs/nfs/blocklayout/extents.c
> +++ b/fs/nfs/blocklayout/extents.c
> @@ -66,8 +66,8 @@ static int32_t _find_entry(struct my_tree_t *tree, u64 s)
> struct pnfs_inval_tracking *pos;
>
> dprintk("%s(%llu) enter\n", __func__, s);
> - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> - if (pos->it_sector < s)
> + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> + if (pos->it_sector > s)
> continue;
> else if (pos->it_sector == s)
> return pos->it_tags & INTERNAL_MASK;
> @@ -102,8 +102,8 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
> struct pnfs_inval_tracking *pos;
>
> dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
> - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> - if (pos->it_sector < s)
> + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> + if (pos->it_sector > s)
> continue;
> else if (pos->it_sector == s) {
> found = 1;
> @@ -125,7 +125,7 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
> }
> new->it_sector = s;
> new->it_tags = (1 << tag);
> - list_add_tail(&new->it_link, &pos->it_link);
> + list_add(&new->it_link, &pos->it_link);
> return 1;
> }
> }
> @@ -230,14 +230,14 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
> u64 expect = 0;
>
> dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
> - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> - if (pos->it_sector < start)
> + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> + if (pos->it_sector >= end)
> continue;
> if (!expect) {
> - if ((pos->it_sector == start) &&
> + if ((pos->it_sector == end - tree->mtt_step_size) &&
> (pos->it_tags & (1 << tag))) {
> - expect = start + tree->mtt_step_size;
> - if (expect == end)
> + expect = pos->it_sector - tree->mtt_step_size;
> + if (expect < start)
> return 1;
> continue;
> } else {
> @@ -246,8 +246,8 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
> }
> if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
> return 0;
> - expect += tree->mtt_step_size;
> - if (expect == end)
> + expect -= tree->mtt_step_size;
> + if (expect < start)
> return 1;
> }
> return 0;
> @@ -594,65 +594,67 @@ add_and_merge_extent(struct pnfs_block_layout *bl,
> /* Scan for proper place to insert, extending new to the left
> * as much as possible.
> */
> - list_for_each_entry_safe(be, tmp, list, be_node) {
> - if (new->be_f_offset < be->be_f_offset)
> + list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
> + if (new->be_f_offset >= be->be_f_offset + be->be_length)
> break;
> - if (end <= be->be_f_offset + be->be_length) {
> - /* new is a subset of existing be*/
> + if (new->be_f_offset >= be->be_f_offset) {
> + if (end <= be->be_f_offset + be->be_length) {
> + /* new is a subset of existing be*/
> + if (extents_consistent(be, new)) {
> + dprintk("%s: new is subset, ignoring\n",
> + __func__);
> + put_extent(new);
> + return 0;
> + } else {
> + goto out_err;
> + }
> + } else {
> + /* |<-- be -->|
> + * |<-- new -->| */
> + if (extents_consistent(be, new)) {
> + /* extend new to fully replace be */
> + new->be_length += new->be_f_offset -
> + be->be_f_offset;
> + new->be_f_offset = be->be_f_offset;
> + new->be_v_offset = be->be_v_offset;
> + dprintk("%s: removing %p\n", __func__, be);
> + list_del(&be->be_node);
> + put_extent(be);
> + } else {
> + goto out_err;
> + }
> + }
> + } else if (end >= be->be_f_offset + be->be_length) {
> + /* new extent overlap existing be */
> if (extents_consistent(be, new)) {
> - dprintk("%s: new is subset, ignoring\n",
> - __func__);
> - put_extent(new);
> - return 0;
> - } else
> + /* extend new to fully replace be */
> + dprintk("%s: removing %p\n", __func__, be);
> + list_del(&be->be_node);
> + put_extent(be);
> + } else {
> goto out_err;
> - } else if (new->be_f_offset <=
> - be->be_f_offset + be->be_length) {
> - /* new overlaps or abuts existing be */
> - if (extents_consistent(be, new)) {
> + }
> + } else if (end > be->be_f_offset) {
> + /* |<-- be -->|
> + *|<-- new -->| */
> + if (extents_consistent(new, be)) {
> /* extend new to fully replace be */
> - new->be_length += new->be_f_offset -
> - be->be_f_offset;
> - new->be_f_offset = be->be_f_offset;
> - new->be_v_offset = be->be_v_offset;
> + new->be_length += be->be_f_offset + be->be_length -
> + new->be_f_offset - new->be_length;
> dprintk("%s: removing %p\n", __func__, be);
> list_del(&be->be_node);
> put_extent(be);
> - } else if (new->be_f_offset !=
> - be->be_f_offset + be->be_length)
> + } else {
> goto out_err;
> + }
> }
> }
> /* Note that if we never hit the above break, be will not point to a
> * valid extent. However, in that case &be->be_node==list.
> */
> - list_add_tail(&new->be_node, &be->be_node);
> + list_add(&new->be_node, &be->be_node);
> dprintk("%s: inserting new\n", __func__);
> print_elist(list);
> - /* Scan forward for overlaps. If we find any, extend new and
> - * remove the overlapped extent.
> - */
> - be = list_prepare_entry(new, list, be_node);
> - list_for_each_entry_safe_continue(be, tmp, list, be_node) {
> - if (end < be->be_f_offset)
> - break;
> - /* new overlaps or abuts existing be */
> - if (extents_consistent(be, new)) {
> - if (end < be->be_f_offset + be->be_length) {
> - /* extend new to fully cover be */
> - end = be->be_f_offset + be->be_length;
> - new->be_length = end - new->be_f_offset;
> - }
> - dprintk("%s: removing %p\n", __func__, be);
> - list_del(&be->be_node);
> - put_extent(be);
> - } else if (end != be->be_f_offset) {
> - list_del(&new->be_node);
> - goto out_err;
> - }
> - }
> - dprintk("%s: after merging\n", __func__);
> - print_elist(list);
> /* STUB - The per-list consistency checks have all been done,
> * should now check cross-list consistency.
> */
> @@ -685,10 +687,10 @@ find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
> if (ret &&
> (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
> break;
> - list_for_each_entry(be, &bl->bl_extents[i], be_node) {
> - if (isect < be->be_f_offset)
> + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
> + if (isect >= be->be_f_offset + be->be_length)
> break;
> - if (isect < be->be_f_offset + be->be_length) {
> + if (isect >= be->be_f_offset) {
> /* We have found an extent */
> dprintk("%s Get %p (%i)\n", __func__, be,
> atomic_read(&be->be_refcnt.refcount));
> @@ -721,10 +723,10 @@ find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
> for (i = 0; i < EXTENT_LISTS; i++) {
> if (ret)
> break;
> - list_for_each_entry(be, &bl->bl_extents[i], be_node) {
> - if (isect < be->be_f_offset)
> + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
> + if (isect >= be->be_f_offset + be->be_length)
> break;
> - if (isect < be->be_f_offset + be->be_length) {
> + if (isect >= be->be_f_offset) {
> /* We have found an extent */
> dprintk("%s Get %p (%i)\n", __func__, be,
> atomic_read(&be->be_refcnt.refcount));


2010-05-12 20:28:16

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH] pnfsblock: Lookup list entry of layouts and tags in reverse order

On Wed, May 12, 2010 at 09:46:43AM +0300, Benny Halevy wrote:
> On May. 10, 2010, 6:36 +0300, Zhang Jingwang <[email protected]> wrote:
> > Optimize for sequencial write. Layout infos and tags are organized by
> > file offset. When appending data to a file whole list will be examined,
> > which introduce notable performance decrease.
>
> Looks good to me.
>
> Fred, can you please double check?

I don't know if Fred's still up for reviewing block stuff?

I've been trying to keep up with at least some minimal testing, but not
as well as I'd like.

The one thing I've noticed is that the connectathon general test has
started failing right at the start with an IO error. The last good
version I tested was b5c09c21, which was based on 33-rc6. The earliest
bad version I tested was 419312ada, based on 34-rc2. A quick look at
network traces from the two traces didn't turn up anything obvious. I
haven't had the chance yet to look closer.

--b.

>
> Benny
>
> P.S.: Zhang, please note Fred's new email address
>
> >
> > Signed-off-by: Zhang Jingwang <[email protected]>
> > ---
> > fs/nfs/blocklayout/extents.c | 126 +++++++++++++++++++++---------------------
> > 1 files changed, 64 insertions(+), 62 deletions(-)
> >
> > diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
> > index 3c311f2..514f2cc 100644
> > --- a/fs/nfs/blocklayout/extents.c
> > +++ b/fs/nfs/blocklayout/extents.c
> > @@ -66,8 +66,8 @@ static int32_t _find_entry(struct my_tree_t *tree, u64 s)
> > struct pnfs_inval_tracking *pos;
> >
> > dprintk("%s(%llu) enter\n", __func__, s);
> > - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> > - if (pos->it_sector < s)
> > + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> > + if (pos->it_sector > s)
> > continue;
> > else if (pos->it_sector == s)
> > return pos->it_tags & INTERNAL_MASK;
> > @@ -102,8 +102,8 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
> > struct pnfs_inval_tracking *pos;
> >
> > dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
> > - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> > - if (pos->it_sector < s)
> > + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> > + if (pos->it_sector > s)
> > continue;
> > else if (pos->it_sector == s) {
> > found = 1;
> > @@ -125,7 +125,7 @@ static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag,
> > }
> > new->it_sector = s;
> > new->it_tags = (1 << tag);
> > - list_add_tail(&new->it_link, &pos->it_link);
> > + list_add(&new->it_link, &pos->it_link);
> > return 1;
> > }
> > }
> > @@ -230,14 +230,14 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
> > u64 expect = 0;
> >
> > dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
> > - list_for_each_entry(pos, &tree->mtt_stub, it_link) {
> > - if (pos->it_sector < start)
> > + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
> > + if (pos->it_sector >= end)
> > continue;
> > if (!expect) {
> > - if ((pos->it_sector == start) &&
> > + if ((pos->it_sector == end - tree->mtt_step_size) &&
> > (pos->it_tags & (1 << tag))) {
> > - expect = start + tree->mtt_step_size;
> > - if (expect == end)
> > + expect = pos->it_sector - tree->mtt_step_size;
> > + if (expect < start)
> > return 1;
> > continue;
> > } else {
> > @@ -246,8 +246,8 @@ _range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
> > }
> > if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
> > return 0;
> > - expect += tree->mtt_step_size;
> > - if (expect == end)
> > + expect -= tree->mtt_step_size;
> > + if (expect < start)
> > return 1;
> > }
> > return 0;
> > @@ -594,65 +594,67 @@ add_and_merge_extent(struct pnfs_block_layout *bl,
> > /* Scan for proper place to insert, extending new to the left
> > * as much as possible.
> > */
> > - list_for_each_entry_safe(be, tmp, list, be_node) {
> > - if (new->be_f_offset < be->be_f_offset)
> > + list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
> > + if (new->be_f_offset >= be->be_f_offset + be->be_length)
> > break;
> > - if (end <= be->be_f_offset + be->be_length) {
> > - /* new is a subset of existing be*/
> > + if (new->be_f_offset >= be->be_f_offset) {
> > + if (end <= be->be_f_offset + be->be_length) {
> > + /* new is a subset of existing be*/
> > + if (extents_consistent(be, new)) {
> > + dprintk("%s: new is subset, ignoring\n",
> > + __func__);
> > + put_extent(new);
> > + return 0;
> > + } else {
> > + goto out_err;
> > + }
> > + } else {
> > + /* |<-- be -->|
> > + * |<-- new -->| */
> > + if (extents_consistent(be, new)) {
> > + /* extend new to fully replace be */
> > + new->be_length += new->be_f_offset -
> > + be->be_f_offset;
> > + new->be_f_offset = be->be_f_offset;
> > + new->be_v_offset = be->be_v_offset;
> > + dprintk("%s: removing %p\n", __func__, be);
> > + list_del(&be->be_node);
> > + put_extent(be);
> > + } else {
> > + goto out_err;
> > + }
> > + }
> > + } else if (end >= be->be_f_offset + be->be_length) {
> > + /* new extent overlap existing be */
> > if (extents_consistent(be, new)) {
> > - dprintk("%s: new is subset, ignoring\n",
> > - __func__);
> > - put_extent(new);
> > - return 0;
> > - } else
> > + /* extend new to fully replace be */
> > + dprintk("%s: removing %p\n", __func__, be);
> > + list_del(&be->be_node);
> > + put_extent(be);
> > + } else {
> > goto out_err;
> > - } else if (new->be_f_offset <=
> > - be->be_f_offset + be->be_length) {
> > - /* new overlaps or abuts existing be */
> > - if (extents_consistent(be, new)) {
> > + }
> > + } else if (end > be->be_f_offset) {
> > + /* |<-- be -->|
> > + *|<-- new -->| */
> > + if (extents_consistent(new, be)) {
> > /* extend new to fully replace be */
> > - new->be_length += new->be_f_offset -
> > - be->be_f_offset;
> > - new->be_f_offset = be->be_f_offset;
> > - new->be_v_offset = be->be_v_offset;
> > + new->be_length += be->be_f_offset + be->be_length -
> > + new->be_f_offset - new->be_length;
> > dprintk("%s: removing %p\n", __func__, be);
> > list_del(&be->be_node);
> > put_extent(be);
> > - } else if (new->be_f_offset !=
> > - be->be_f_offset + be->be_length)
> > + } else {
> > goto out_err;
> > + }
> > }
> > }
> > /* Note that if we never hit the above break, be will not point to a
> > * valid extent. However, in that case &be->be_node==list.
> > */
> > - list_add_tail(&new->be_node, &be->be_node);
> > + list_add(&new->be_node, &be->be_node);
> > dprintk("%s: inserting new\n", __func__);
> > print_elist(list);
> > - /* Scan forward for overlaps. If we find any, extend new and
> > - * remove the overlapped extent.
> > - */
> > - be = list_prepare_entry(new, list, be_node);
> > - list_for_each_entry_safe_continue(be, tmp, list, be_node) {
> > - if (end < be->be_f_offset)
> > - break;
> > - /* new overlaps or abuts existing be */
> > - if (extents_consistent(be, new)) {
> > - if (end < be->be_f_offset + be->be_length) {
> > - /* extend new to fully cover be */
> > - end = be->be_f_offset + be->be_length;
> > - new->be_length = end - new->be_f_offset;
> > - }
> > - dprintk("%s: removing %p\n", __func__, be);
> > - list_del(&be->be_node);
> > - put_extent(be);
> > - } else if (end != be->be_f_offset) {
> > - list_del(&new->be_node);
> > - goto out_err;
> > - }
> > - }
> > - dprintk("%s: after merging\n", __func__);
> > - print_elist(list);
> > /* STUB - The per-list consistency checks have all been done,
> > * should now check cross-list consistency.
> > */
> > @@ -685,10 +687,10 @@ find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
> > if (ret &&
> > (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
> > break;
> > - list_for_each_entry(be, &bl->bl_extents[i], be_node) {
> > - if (isect < be->be_f_offset)
> > + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
> > + if (isect >= be->be_f_offset + be->be_length)
> > break;
> > - if (isect < be->be_f_offset + be->be_length) {
> > + if (isect >= be->be_f_offset) {
> > /* We have found an extent */
> > dprintk("%s Get %p (%i)\n", __func__, be,
> > atomic_read(&be->be_refcnt.refcount));
> > @@ -721,10 +723,10 @@ find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
> > for (i = 0; i < EXTENT_LISTS; i++) {
> > if (ret)
> > break;
> > - list_for_each_entry(be, &bl->bl_extents[i], be_node) {
> > - if (isect < be->be_f_offset)
> > + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
> > + if (isect >= be->be_f_offset + be->be_length)
> > break;
> > - if (isect < be->be_f_offset + be->be_length) {
> > + if (isect >= be->be_f_offset) {
> > /* We have found an extent */
> > dprintk("%s Get %p (%i)\n", __func__, be,
> > atomic_read(&be->be_refcnt.refcount));
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html